This workflow fits a model across all of CONUS that predicts whether a location does or does not have trees.
The data consists of vegetation % cover by functional group from across CONUS (from AIM, FIA, LANDFIRE, and RAP), as well as climate variables from DayMet, which have been aggregated into mean interannual conditions accross multiple temporal windows.
Set user defined parameters
print(params)
## $run
## [1] FALSE
##
## $save_figs
## [1] TRUE
##
## $ecoregion
## [1] "CONUS"
##
## $response
## [1] "TotalTreeCover"
##
## $treeThreshold
## [1] 10
##
## $whichSecondBestMod
## [1] "halfse"
##
## $thresholdMethod
## [1] "PredPrev=Obs"
# set to true if want to run for a limited number of rows (i.e. for code testing)
test_run <- params$test_run
save_figs <- params$save_figs
response <- params$response
fit_sample <- TRUE # fit model to a sample of the data
n_train <- 5e4 # sample size of the training data
n_test <- 1e6 # sample size of the testing data (if this is too big the decile dotplot code throws memory errors)
run <- params$run
whichSecondBestMod <- params$whichSecondBestMod
treeThreshold <- params$treeThreshold
thresholdMethod <- params$thresholdMethod
Load packages and source functions
# set option so resampled dataset created here reproduces earlier runs of this code with dplyr 1.0.10
source("../../../Functions/glmTransformsIterates.R")
source("../../../Functions/transformPreds.R")
source("../../../Functions/betaLASSO.R")
#source("../../../Functions/StepBeta_mine.R")
#source("src/fig_params.R")
#source("src/modeling_functions.R")
library(betareg)
library(ggspatial)
library(terra)
library(tidyterra)
library(sf)
library(caret)
library(tidyverse)
library(GGally) # for ggpairs()
library(pdp) # for partial dependence plots
library(gridExtra)
library(knitr)
library(patchwork) # for figure insets etc.
library(ggtext)
library(StepBeta)
theme_set(theme_classic())
library(here)
library(rsample)
library(kableExtra)
library(glmnet)
library(USA.state.boundaries)
library(cvms)
library(rsvg)
library(ggimage)
library(doMC)
library(parallel)
library(pROC)
Data compiled in the prepDataForModels.R script
here::i_am("Analysis/VegComposition/ModelFitting/02_ModelFitting_globalTreeModel.Rmd")
modDat <- readRDS( here("Data_processed", "CoverData", "DataForModels_spatiallyAveraged_withSoils_noSf_sampledLANDFIRE.rds")) %>% st_drop_geometry()
We will fit a binomial model that predicts whether or not there are trees at a location. Because the tree cover data we have is continuous between 0 and 100, we convert it to be binomial be forcing any values ≤ % to be 0, and any values > % to be 1.
modDat <- modDat %>%
mutate(TotalTreeCover_binom = replace(TotalTreeCover, TotalTreeCover <=treeThreshold, 0)) %>%
mutate(TotalTreeCover_binom = replace(TotalTreeCover_binom, TotalTreeCover_binom > treeThreshold, 1))
set.seed(1234)
# now, rename columns for brevity
modDat_1 <- modDat %>%
dplyr::select(-c(prcp_annTotal:annVPD_min)) %>%
# mutate(Lon = st_coordinates(.)[,1],
# Lat = st_coordinates(.)[,2]) %>%
# st_drop_geometry() %>%
# filter(!is.na(newRegion))
rename("tmin" = tmin_meanAnnAvg_CLIM,
"tmax" = tmax_meanAnnAvg_CLIM, #1
"tmean" = tmean_meanAnnAvg_CLIM,
"prcp" = prcp_meanAnnTotal_CLIM,
"t_warm" = T_warmestMonth_meanAnnAvg_CLIM,
"t_cold" = T_coldestMonth_meanAnnAvg_CLIM,
"prcp_wet" = precip_wettestMonth_meanAnnAvg_CLIM,
"prcp_dry" = precip_driestMonth_meanAnnAvg_CLIM,
"prcp_seasonality" = precip_Seasonality_meanAnnAvg_CLIM, #2
"prcpTempCorr" = PrecipTempCorr_meanAnnAvg_CLIM, #3
"abvFreezingMonth" = aboveFreezing_month_meanAnnAvg_CLIM,
"isothermality" = isothermality_meanAnnAvg_CLIM, #4
"annWatDef" = annWaterDeficit_meanAnnAvg_CLIM,
"annWetDegDays" = annWetDegDays_meanAnnAvg_CLIM,
"VPD_mean" = annVPD_mean_meanAnnAvg_CLIM,
"VPD_max" = annVPD_max_meanAnnAvg_CLIM, #5
"VPD_min" = annVPD_min_meanAnnAvg_CLIM, #6
"VPD_max_95" = annVPD_max_95percentile_CLIM,
"annWatDef_95" = annWaterDeficit_95percentile_CLIM,
"annWetDegDays_5" = annWetDegDays_5percentile_CLIM,
"frostFreeDays_5" = durationFrostFreeDays_5percentile_CLIM,
"frostFreeDays" = durationFrostFreeDays_meanAnnAvg_CLIM,
"soilDepth" = soilDepth, #7
"clay" = surfaceClay_perc,
"sand" = avgSandPerc_acrossDepth, #8
"coarse" = avgCoarsePerc_acrossDepth, #9
"carbon" = avgOrganicCarbonPerc_0_3cm, #10
"AWHC" = totalAvailableWaterHoldingCapacity,
## anomaly variables
tmean_anom = tmean_meanAnnAvg_3yrAnom, #15
tmin_anom = tmin_meanAnnAvg_3yrAnom, #16
tmax_anom = tmax_meanAnnAvg_3yrAnom, #17
prcp_anom = prcp_meanAnnTotal_3yrAnom, #18
t_warm_anom = T_warmestMonth_meanAnnAvg_3yrAnom, #19
t_cold_anom = T_coldestMonth_meanAnnAvg_3yrAnom, #20
prcp_wet_anom = precip_wettestMonth_meanAnnAvg_3yrAnom, #21
precp_dry_anom = precip_driestMonth_meanAnnAvg_3yrAnom, #22
prcp_seasonality_anom = precip_Seasonality_meanAnnAvg_3yrAnom, #23
prcpTempCorr_anom = PrecipTempCorr_meanAnnAvg_3yrAnom, #24
aboveFreezingMonth_anom = aboveFreezing_month_meanAnnAvg_3yrAnom, #25
isothermality_anom = isothermality_meanAnnAvg_3yrAnom, #26
annWatDef_anom = annWaterDeficit_meanAnnAvg_3yrAnom, #27
annWetDegDays_anom = annWetDegDays_meanAnnAvg_3yrAnom, #28
VPD_mean_anom = annVPD_mean_meanAnnAvg_3yrAnom, #29
VPD_min_anom = annVPD_min_meanAnnAvg_3yrAnom, #30
VPD_max_anom = annVPD_max_meanAnnAvg_3yrAnom, #31
VPD_max_95_anom = annVPD_max_95percentile_3yrAnom, #32
annWatDef_95_anom = annWaterDeficit_95percentile_3yrAnom, #33
annWetDegDays_5_anom = annWetDegDays_5percentile_3yrAnom , #34
frostFreeDays_5_anom = durationFrostFreeDays_5percentile_3yrAnom, #35
frostFreeDays_anom = durationFrostFreeDays_meanAnnAvg_3yrAnom #36
) %>%
dplyr::select(-c(tmin_meanAnnAvg_3yr:durationFrostFreeDays_meanAnnAvg_3yr))
The following are the candidate predictor variables for this ecoregion:
prednames <- c(
"tmean",
"prcp",
"prcp_dry",
"prcp_seasonality",
"prcpTempCorr",
"isothermality",
"annWetDegDays",
"annWatDef_95",
"clay",
"coarse",
"AWHC"
#"tmean", "prcp", "prcp_seasonality", "prcpTempCorr", "isothermality", "annWetDegDays", "sand", "coarse", "AWHC"
)
print(prednames)
## [1] "tmean" "prcp" "prcp_dry" "prcp_seasonality"
## [5] "prcpTempCorr" "isothermality" "annWetDegDays" "annWatDef_95"
## [9] "clay" "coarse" "AWHC"
allPreds <- modDat_1 %>%
dplyr::select(tmin:frostFreeDays,tmean_anom:frostFreeDays_anom, soilDepth:AWHC) %>%
names()
modDat_1_s_temp <- modDat_1 %>%
mutate(across(all_of(allPreds), base::scale, .names = "{.col}_s"))
saveRDS(modDat_1_s_temp, file = "./models/scaledModelInputData_yesNoTreeMod.rds")
# Remove the rows that have no observations for total tree cover
modDat_1_s <- modDat_1_s_temp[!is.na(modDat_1_s_temp[,"TotalTreeCover_binom"]),]
# subset the data to only include these predictors, and remove any remaining NAs
modDat_1_s <- modDat_1_s %>%
dplyr::select(prednames, paste0(prednames, "_s"), TotalTreeCover, TotalTreeCover_binom, newRegion, Year, x, y, NA_L1NAME, NA_L2NAME) %>%
drop_na()
names(prednames) <- prednames
df_pred <- modDat_1_s[, prednames]
response <- "TotalTreeCover"
ggplot(modDat_1_s) +
geom_histogram(aes(TotalTreeCover/100), fill = "forestgreen", col = "forestgreen", alpha = .5) +
xlab("Tree Cover") +
ggtitle("Untransformed, observed tree cover")
ggplot(modDat_1_s) +
geom_histogram(aes(TotalTreeCover_binom), fill = "purple", alpha = .5, col = "purple") +
xlab("Tree Cover") +
ggtitle(paste0("Tree cover, converted to binomial with a ",treeThreshold,"cutoff"))
create_summary <- function(df) {
df %>%
pivot_longer(cols = everything(),
names_to = 'variable') %>%
group_by(variable) %>%
summarise(across(value, .fns = list(mean = ~mean(.x, na.rm = TRUE), min = ~min(.x, na.rm = TRUE),
median = ~median(.x, na.rm = TRUE), max = ~max(.x, na.rm = TRUE)))) %>%
mutate(across(where(is.numeric), round, 4))
}
modDat_1_s[prednames] %>%
create_summary() %>%
knitr::kable(caption = 'summaries of possible predictor variables') %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| variable | value_mean | value_min | value_median | value_max |
|---|---|---|---|---|
| AWHC | 14.8606 | 0.1424 | 14.2535 | 35.2881 |
| annWatDef_95 | 164.3643 | 0.0000 | 135.6146 | 601.2686 |
| annWetDegDays | 1854.7754 | 81.2108 | 1608.8952 | 7131.5166 |
| clay | 19.7378 | 0.1687 | 18.6720 | 83.0975 |
| coarse | 9.3788 | 0.0000 | 5.4986 | 79.9649 |
| isothermality | 37.6851 | 19.4935 | 37.7091 | 63.7425 |
| prcp | 483.0393 | 47.6797 | 399.3619 | 4360.3490 |
| prcpTempCorr | 0.0474 | -0.8613 | 0.1239 | 0.7098 |
| prcp_dry | 3.2466 | 0.0000 | 1.6926 | 81.4306 |
| prcp_seasonality | 0.9725 | 0.3568 | 0.9379 | 2.2319 |
| tmean | 11.1504 | -2.2524 | 10.0136 | 24.6330 |
scaleFigDat_1 <- modDat_1_s %>%
dplyr::select(c(x, y, Year, prednames)) %>%
pivot_longer(cols = all_of(names(prednames)),
names_to = "predNames",
values_to = "predValues_unScaled")
scaleFigDat_2 <- modDat_1_s %>%
dplyr::select(c(x, y, Year, paste0(prednames, "_s"))) %>%
pivot_longer(cols = all_of(paste0(prednames,"_s"
)),
names_to = "predNames",
values_to = "predValues_scaled",
names_sep = ) %>%
mutate(predNames = str_replace(predNames, pattern = "_s$", replacement = ""))
scaleFigDat_3 <- scaleFigDat_1 %>%
left_join(scaleFigDat_2)
ggplot(scaleFigDat_3) +
facet_wrap(~predNames, scales = "free") +
geom_histogram(aes(predValues_unScaled), fill = "lightgrey", col = "darkgrey") +
geom_histogram(aes(predValues_scaled), fill = "lightblue", col = "blue") +
xlab ("predictor variable values") +
ggtitle("Comparing the distribution of unscaled (grey) to scaled (blue) predictor variables")
modDat_1_s <- modDat_1_s %>%
rename_with(~paste0(.x, "_raw"),
any_of(names(prednames))) %>%
rename_with(~str_remove(.x, "_s$"),
any_of(paste0(names(prednames), "_s")))
set.seed(12011993)
# vector of name of response variables
vars_response <- response
# longformat dataframes for making boxplots
df_sample_plots <- modDat_1_s %>%
slice_sample(n = 5e4) %>%
rename(response = all_of("TotalTreeCover_binom")) %>%
mutate(response = case_when(
response == 0 ~ "0",
response > 0 ~ "1",
)) %>%
dplyr::select(c(response, prednames)) %>%
tidyr::pivot_longer(cols = unname(prednames),
names_to = "predictor",
values_to = "value"
)
ggplot(df_sample_plots, aes_string(x= "response", y = 'value')) +
geom_boxplot() +
facet_wrap(~predictor , scales = 'free_y') +
ylab("Predictor Variable Values") +
xlab(response)
# # # do a pca of climate across years
# pca_yrs <- prcomp(modDat_1_s[,paste0(prednames)])
# # library(factoextra)
# (fviz_pca_ind(pca_yrs, habillage = modDat_1_s$Year, label = "none", addEllipses = TRUE, ellipse.level = .95, ggtheme = theme_minimal(), alpha.ind = .1))
#
# ggplot(modDat_1_s) +
# geom_point(aes(modDat_1_s$Year, modDat_1_s$TotalTreeCover)) +
# geom_smooth(aes(modDat_1_s$Year, modDat_1_s$TotalTreeCover))
# save test set
modDat_1_sTest <- modDat_1_s[modDat_1_s$Year %in% c(2003, 2012, 2019),]
modDat_1_s <- modDat_1_s[!modDat_1_s$Year %in% c(2003, 2012, 2019),]
First, if there are observations in Louisiana, sub-sample them so they’re not so over-represented in the dataset
## make data into spatial format
modDat_1_sf <- modDat_1_s %>%
st_as_sf(coords = c("x", "y"), crs = st_crs("EPSG:4326"))
# download map info for visualization
data(state_boundaries_wgs84)
cropped_states <- suppressMessages(state_boundaries_wgs84 %>%
dplyr::filter(NAME!="Hawaii") %>%
dplyr::filter(NAME!="Alaska") %>%
dplyr::filter(NAME!="Puerto Rico") %>%
dplyr::filter(NAME!="American Samoa") %>%
dplyr::filter(NAME!="Guam") %>%
dplyr::filter(NAME!="Commonwealth of the Northern Mariana Islands") %>%
dplyr::filter(NAME!="United States Virgin Islands") %>%
sf::st_sf() %>%
sf::st_transform(sf::st_crs(modDat_1_sf)))
## do a pca of climate across level 2 ecoregions
pca <- prcomp(modDat_1_s[,paste0(prednames)])
library(factoextra)
(fviz_pca_ind(pca, habillage = modDat_1_s$NA_L2NAME, label = "none", addEllipses = TRUE, ellipse.level = .95, ggtheme = theme_minimal(), alpha.ind = .1))
# make a table of n for each region
modDat_1_s %>%
group_by(NA_L2NAME) %>%
dplyr::summarize("Number_Of_Observations" = length(NA_L2NAME)) %>%
rename("Level_2_Ecoregion" = NA_L2NAME)%>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Level_2_Ecoregion | Number_Of_Observations |
|---|---|
| ATLANTIC HIGHLANDS | 827 |
| CENTRAL USA PLAINS | 71 |
| COLD DESERTS | 144290 |
| EVERGLADES | 2 |
| MARINE WEST COAST FOREST | 2266 |
| MEDITERRANEAN CALIFORNIA | 11356 |
| MISSISSIPPI ALLUVIAL AND SOUTHEAST USA COASTAL PLAINS | 1445 |
| MIXED WOOD PLAINS | 1179 |
| MIXED WOOD SHIELD | 1014 |
| OZARK/OUACHITA-APPALACHIAN FORESTS | 1993 |
| SOUTH CENTRAL SEMIARID PRAIRIES | 96157 |
| SOUTHEASTERN USA PLAINS | 3816 |
| TAMAULIPAS-TEXAS SEMIARID PLAIN | 6476 |
| TEMPERATE PRAIRIES | 12890 |
| TEXAS-LOUISIANA COASTAL PLAIN | 3888 |
| UPPER GILA MOUNTAINS | 6949 |
| WARM DESERTS | 58351 |
| WEST-CENTRAL SEMIARID PRAIRIES | 75472 |
| WESTERN CORDILLERA | 35546 |
| WESTERN SIERRA MADRE PIEDMONT | 5975 |
map1 <- ggplot() +
geom_sf(data=cropped_states,fill='white') +
geom_sf(data=modDat_1_sf#[modDat_1_sf$NA_L2NAME %in% c("MIXED WOOD PLAINS"),]
,
aes(fill=as.factor(NA_L2NAME)),linewidth=0.5,alpha=0.5) +
geom_point(data=modDat_1_s#[modDat_1_s$NA_L2NAME %in% c("MIXED WOOD PLAINS"),]
,
alpha=0.5,
aes(x = x, y = y, color=as.factor(NA_L2NAME)), alpha = .1) +
#scale_fill_okabeito() +
#scale_color_okabeito() +
# theme_default() +
theme(legend.position = 'none') +
labs(title = "Level 2 Ecoregions as spatial blocks")
hull <- modDat_1_sf %>%
ungroup() %>%
group_by(NA_L2NAME) %>%
slice(chull(tmean, prcp))
plot1<-ggplot(data=modDat_1_sf,aes(x=tmean,y=prcp)) +
geom_polygon(data = hull, alpha = 0.25,aes(fill=NA_L2NAME) )+
geom_point(aes(group=NA_L2NAME,color=NA_L2NAME),alpha=0.25) +
theme_minimal() + xlab("Annual Average T_mean - long-term average") +
ylab("Annual Average Precip - long-term average") #+
#scale_color_okabeito() +
#scale_fill_okabeito()
plot2<-ggplot(data=modDat_1_sf %>%
pivot_longer(cols=tmean:prcp),
aes(x=value,group=name)) +
# geom_polygon(data = hull, alpha = 0.25,aes(fill=fold) )+
geom_density(aes(group=NA_L2NAME,fill=NA_L2NAME),alpha=0.25) +
theme_minimal() +
facet_wrap(~name,scales='free')# +
#scale_color_okabeito() +
#scale_fill_okabeito()
library(patchwork)
(combo <- (map1+plot1)/plot2)
#
# ggplot(data = modDat_1_s) +
# geom_density(aes(ShrubCover_transformed, col = NA_L2NAME)) +
# xlim(c(0,100))
if (run == TRUE) {
# set up custom folds
# get the ecoregions for training lambda
train_eco <- modDat_1_s$NA_L2NAME#[train]
# Fit model -----------------------------------------------
# specify leave-one-year-out cross-validation
my_folds <- as.numeric(as.factor(train_eco))
# set up parallel processing
# this computer has 16 cores (parallel::detectCores())
registerDoMC(cores = 8)
fit <- cv.glmnet(
x = X[,2:ncol(X)],
y = y,
family = "binomial",
keep = FALSE,
alpha = 1, # 0 == ridge regression, 1 == lasso, 0.5 ~~ elastic net
lambda = lambdas,
relax = ifelse(response == "ShrubCover", yes = TRUE, no = FALSE),
#nlambda = 100,
type.measure="mse",
#penalty.factor = pen_facts,
foldid = my_folds,
#thresh = thresh,
standardize = FALSE, ## scales variables prior to the model sequence... coefficients are always returned on the original scale
parallel = TRUE
)
base::saveRDS(fit, paste0("../ModelFitting/models/yesOrNoTrees_globalLASSOmod_binomial_treeCutoff_",treeThreshold,".rds"))
best_lambda <- fit$lambda.min
# save the lambda for the most regularized model w/ an MSE that is still 1SE w/in the best lambda model
lambda_1SE <- fit$lambda.1se
# save the lambda for the most regularized model w/ an MSE that is still .5SE w/in the best lambda model
lambda_halfSE <- best_lambda + ((lambda_1SE - best_lambda)/2)
## Now, we need to do stability selection to ensure the coefficients that are being chosen with each lambda are stable
## stability selection for best lambda model
# setup params
p <- ncol(X[,2:ncol(X)]) # of parameters
n <- length(y) # of observations
n_iter <- 100 # number of subsamples
sample_frac <- 0.75 # fraction of data to subsample
lambda_val <- best_lambda # fixed lambda value (could be chosen via CV)
# Track selection
selection_counts_bestL <- matrix(0, nrow = p, ncol = 1)
for (i in 1:n_iter) {
# Subsample rows
sample_idx <- sample(1:n, size = floor(sample_frac * n), replace = FALSE)
X_sub <- X[sample_idx, ]
y_sub <- y[sample_idx]
# Fit Lasso model
fit_stab_i <- glmnet(x = X_sub[,2:ncol(X_sub)], y = y_sub,
family = "binomial",
alpha = 1, lambda = lambda_val, standardize = FALSE)
# Get non-zero coefficients (excluding intercept)
select_bestL <- which(as.vector(coef(fit_stab_i))[-1] != 0)
selection_counts_bestL[select_bestL] <- selection_counts_bestL[select_bestL] + 1
}
# Convert counts to selection probabilities (the probability that a variable is selected over 100 iterations)
selection_prob_bestL <- selection_counts_bestL / n_iter
selection_prob_bestL_df <- data.frame(
VariableNumber = paste0("X", 1:p),
VariableName = rownames(coef(fit_stab_i))[2:(p+1)],
SelectionProb = as.vector(selection_prob_bestL)
)
# get those variables that are selected in ≥70% of subsets (Meinshausen and Bühlmann, 2010)
bestLambda_coef <- selection_prob_bestL_df[selection_prob_bestL_df$SelectionProb>=.7, c("VariableName", "SelectionProb")]
#//////
# stability selection for 1se lambda model
lambda_val <- lambda_1SE # fixed lambda value (could be chosen via CV)
# Track selection
selection_counts_1seL <- matrix(0, nrow = p, ncol = 1)
for (i in 1:n_iter) {
# Subsample rows
sample_idx <- sample(1:n, size = floor(sample_frac * n), replace = FALSE)
X_sub <- X[sample_idx, ]
y_sub <- y[sample_idx]
# Fit Lasso model
fit_stab_i <- glmnet(x = X_sub[,2:ncol(X_sub)], y = y_sub,
family = "binomial",
alpha = 1, lambda = lambda_val, standardize = FALSE)
# Get non-zero coefficients (excluding intercept)
selected_1seL <- which(as.vector(coef(fit_stab_i))[-1] != 0)
selection_counts_1seL[selected_1seL] <- selection_counts_1seL[selected_1seL] + 1
}
# Convert counts to selection probabilities (the probability that a variable is selected over 100 iterations)
selection_prob_1seL <- selection_counts_1seL / n_iter
selection_prob_1seL_df <- data.frame(
VariableNumber = paste0("X", 1:p),
VariableName = rownames(coef(fit_stab_i))[2:(p+1)],
SelectionProb = as.vector(selection_prob_1seL)
)
# get those variables that are selected in ≥70% of subsets (Meinshausen and Bühlmann, 2010)
seLambda_coef <- selection_prob_1seL_df[selection_prob_1seL_df$SelectionProb>=.7, c("VariableName", "SelectionProb")]
# stability selection for half se lambda model
lambda_val <- lambda_halfSE # fixed lambda value (could be chosen via CV)
# Track selection
selection_counts_halfseL <- matrix(0, nrow = p, ncol = 1)
for (i in 1:n_iter) {
# Subsample rows
sample_idx <- sample(1:n, size = floor(sample_frac * n), replace = FALSE)
X_sub <- X[sample_idx, ]
y_sub <- y[sample_idx]
# Fit Lasso model
fit_stab_i <- glmnet(x = X_sub[,2:ncol(X_sub)], y = y_sub,
family = "binomial",
alpha = 1, lambda = lambda_val, standardize = FALSE)
# Get non-zero coefficients (excluding intercept)
selected_halfseL <- which(as.vector(coef(fit_stab_i))[-1] != 0)
selection_counts_halfseL[selected_halfseL] <- selection_counts_halfseL[selected_halfseL] + 1
}
# Convert counts to selection probabilities (the probability that a variable is selected_halfseL over 100 iterations)
selection_prob_halfseL <- selection_counts_halfseL / n_iter
selection_prob_halfseL_df <- data.frame(
VariableNumber = paste0("X", 1:p),
VariableName = rownames(coef(fit_stab_i))[2:(p+1)],
SelectionProb = as.vector(selection_prob_halfseL)
)
# get those variables that are selected_halfseL_halfseL in ≥70% of subsets (Meinshausen and Bühlmann, 2010)
halfseLambda_coef <- selection_prob_halfseL_df[selection_prob_halfseL_df$SelectionProb>=.7, c("VariableName", "SelectionProb")]
## fit w/ the identified coefficients from the 'best' lambda, but using the glm function
mat_glmnet_best <- bestLambda_coef$VariableName
if (length(mat_glmnet_best) == 0) {
f_glm_bestLambda <- as.formula(paste0("TotalTreeCover_binom", "~ 1"))
} else {
f_glm_bestLambda <- as.formula(paste0("TotalTreeCover_binom", " ~ ", paste0(mat_glmnet_best, collapse = " + ")))
}
## fit using betareg
fit_glm_bestLambda <- fit_glm_bestLambda_binomial <- glm(formula = f_glm_bestLambda, data = modDat_1_s, family = binomial)
## fit w/ the identified coefficients from the '1se' lambda, but using the glm function
mat_glmnet_1se <- seLambda_coef$VariableName
if (length(mat_glmnet_1se) == 0) {
f_glm_1se <- as.formula(paste0("TotalTreeCover_binom", "~ 1"))
} else {
f_glm_1se <- as.formula(paste0("TotalTreeCover_binom", " ~ ", paste0(mat_glmnet_1se, collapse = " + ")))
}
fit_glm_se <- glm(formula = f_glm_1se, data = modDat_1_s, family = binomial)
# glm(data = modDat_1_s, formula = f_glm_1se,
# family = stats::Gamma(link = "log"))
## fit w/ the identified coefficients from the '.5se' lambda, but using the glm function
mat_glmnet_halfse <- halfseLambda_coef$VariableName
if (length(mat_glmnet_halfse) == 0) {
f_glm_halfse <- as.formula(paste0("TotalTreeCover_binom", "~ 1"))
} else {
f_glm_halfse <- as.formula(paste0("TotalTreeCover_binom", " ~ ", paste0(mat_glmnet_halfse, collapse = " + ")))
}
fit_glm_halfse <- glm(formula = f_glm_halfse, data = modDat_1_s, family = binomial )
## save models
saveRDS(fit_glm_bestLambda, paste0("./models/yesOrNoTrees_bestLambdaGLM_",treeThreshold,".rds"))
saveRDS(fit_glm_halfse, paste0("./models/yesOrNoTrees_halfSELambdaGLM_",treeThreshold,".rds"))
saveRDS(fit_glm_se, paste0("./models/yesOrNoTrees_oneSELambdaGLM_",treeThreshold,".rds"))
## save the R environment after running the models
save(f_glm_halfse, mat_glmnet_halfse, halfseLambda_coef,
f_glm_1se, mat_glmnet_1se, seLambda_coef,
f_glm_bestLambda, mat_glmnet_best, bestLambda_coef,
file = paste0("./models/interimModelFittingObjects_yesOrNoTrees_binomial_",treeThreshold,".rds"))
} else {
# read in LASSO object
fit <- readRDS(paste0("../ModelFitting/models/yesOrNoTrees_globalLASSOmod_binomial_treeCutoff_",treeThreshold,".rds"))
# read in R objects having to do w/ model fitting
load(file = paste0("./models/interimModelFittingObjects_yesOrNoTrees_binomial_",treeThreshold,".rds"))
fit_glm_bestLambda <- readRDS(paste0("./models/yesOrNoTrees_bestLambdaGLM_",treeThreshold,".rds"))
fit_glm_halfse <- readRDS(paste0("./models/yesOrNoTrees_halfSELambdaGLM_",treeThreshold,".rds"))
fit_glm_se <- readRDS(paste0("./models/yesOrNoTrees_oneSELambdaGLM_",treeThreshold,".rds"))
}
# assess model fit
# assess.glmnet(fit$fit.preval, #newx = X[,2:293],
# newy = y, family = stats::Gamma(link = "log"))
# save the minimum lambda
best_lambda <- fit$lambda.min
# save the lambda for the most regularized model w/ an MSE that is still 1SE w/in the best lambda model
lambda_1SE <- fit$lambda.1se
# save the lambda for the most regularized model w/ an MSE that is still .5SE w/in the best lambda model
lambda_halfSE <- best_lambda + ((lambda_1SE - best_lambda)/2)
print(fit)
##
## Call: cv.glmnet(x = X[, 2:ncol(X)], y = y, lambda = lambdas, type.measure = "mse", foldid = my_folds, keep = FALSE, parallel = TRUE, relax = ifelse(response == "ShrubCover", yes = TRUE, no = FALSE), family = "binomial", alpha = 1, standardize = FALSE)
##
## Measure: Mean-Squared Error
##
## Lambda Index Measure SE Nonzero
## min 0.00365 115 0.2447 0.03089 22
## 1se 0.04150 80 0.2736 0.02847 5
plot(fit)
## predict on the test data
# lasso model predictions with the optimal lambda
optimal_pred <- stats::predict(fit_glm_bestLambda, #newx=X[,2:ncol(X)],
type = "response")
optimal_pred_1se <- stats::predict(fit_glm_se, newx=X[,2:ncol(X)], type = "response")
optimal_pred_halfse <- stats::predict(fit_glm_halfse, newx = X[,2:ncol(X)], type = "response")
null_fit <- glm(
formula = y ~ 1, #data = modDat_1_s,
family = binomial
)
null_pred <- predict(null_fit, newdata = as.data.frame(X), type = "response"
)
# save data
fullModOut <- list(
"modelObject" = fit,
"nullModelObject" = null_fit,
"modelPredictions" = data.frame(#ecoRegion_holdout = rep(test_eco,length(y)),
obs=y,
pred_opt=optimal_pred,
pred_opt_se = optimal_pred_1se,
pred_opt_halfse = optimal_pred_halfse,
pred_null=null_pred#,
#pred_nopenalty=nopen_pred
))
ggplot() +
geom_point(aes(X[,2], fullModOut$modelPredictions$obs), col = "black", alpha = .1) +
geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt), col = "red", alpha = .1) + ## predictions w/ the CV model (best lambda)
geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt_halfse), col = "orange", alpha = .1) + ## predictions w/ the CV model (.5se lambda)
geom_point(aes(X[,2], fullModOut$modelPredictions$pred_opt_se), col = "green", alpha = .1) + ## predictions w/ the CV model (1se lambda)
geom_point(aes(X[,2], fullModOut$modelPredictions$pred_null), col = "blue", alpha = .1) +
labs(title = "A rough comparison of observed and model-predicted values",
subtitle = "black = observed values \n red = predictions from 'best lambda' model \n orange = predictions for '1/2se' lambda model \n green = predictions from '1se' lambda model \n blue = predictions from null model") +
xlab(colnames(X)[2])
#ylim(c(0,200))
r{print(best_lambda)}. The lambda value such that the cross
validation error is within 1 standard error of the minimum (“1se
lambda”) was `r{print(fit$lambda.1se)}`` . The following coefficients
were kept in each model:# the coefficient matrix from the 'best model' -- find and print those coefficients that aren't 0 in a table
coef_glm_bestLambda <- coef(fit_glm_bestLambda) %>%
data.frame()
coef_glm_bestLambda$coefficientName <- rownames(coef_glm_bestLambda)
names(coef_glm_bestLambda)[1] <- "coefficientValue_bestLambda"
# coefficient matrix from the '1se' model
coef_glm_1se <- coef(fit_glm_se) %>%
data.frame()
coef_glm_1se$coefficientName <- rownames(coef_glm_1se)
names(coef_glm_1se)[1] <- "coefficientValue_1seLambda"
# coefficient matrix from the 'half se' model
coef_glm_halfse <- coef(fit_glm_halfse) %>%
data.frame()
coef_glm_halfse$coefficientName <- rownames(coef_glm_halfse)
names(coef_glm_halfse)[1] <- "coefficientValue_halfseLambda"
# add together
coefs <- full_join(coef_glm_bestLambda, coef_glm_halfse) %>%
full_join(coef_glm_1se) %>%
dplyr::select(coefficientName, coefficientValue_bestLambda,
coefficientValue_halfseLambda, coefficientValue_1seLambda)
globModTerms <- coefs[!is.na(coefs$coefficientValue_bestLambda), "coefficientName"]
## also, get the number of unique variables in each model
var_prop_pred <- paste0(response, "_pred")
response_vars <- c(response, var_prop_pred)
# for best lambda model
prednames_fig <- paste(str_split(globModTerms, ":", simplify = TRUE))
prednames_fig <- str_replace(prednames_fig, "I\\(", "")
prednames_fig <- str_replace(prednames_fig, "\\^2\\)", "")
prednames_fig <- unique(prednames_fig[prednames_fig>0])
prednames_fig <- prednames_fig
prednames_fig_num <- length(prednames_fig)
# for 1SE lambda model
globModTerms_1se <- coefs[!is.na(coefs$coefficientValue_1seLambda), "coefficientName"]
if (length(globModTerms_1se) == 1) {
prednames_fig_1se <- paste(str_split(globModTerms_1se, ":", simplify = TRUE))
prednames_fig_1se <- str_replace(prednames_fig_1se, "I\\(", "")
prednames_fig_1se <- str_replace(prednames_fig_1se, "\\^2\\)", "")
prednames_fig_1se <- unique(prednames_fig_1se[prednames_fig_1se>0])
prednames_fig_1se_num <- c(0)
} else {
prednames_fig_1se <- paste(str_split(globModTerms_1se, ":", simplify = TRUE))
prednames_fig_1se <- str_replace(prednames_fig_1se, "I\\(", "")
prednames_fig_1se <- str_replace(prednames_fig_1se, "\\^2\\)", "")
prednames_fig_1se <- unique(prednames_fig_1se[prednames_fig_1se>0])
prednames_fig_1se_num <- length(prednames_fig_1se)
}
# for 1/2SE lambda model
globModTerms_halfse <- coefs[!is.na(coefs$coefficientValue_halfseLambda), "coefficientName"]
if (length(globModTerms_halfse) == 1) {
prednames_fig_halfse <- paste(str_split(globModTerms_halfse, ":", simplify = TRUE))
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "I\\(", "")
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "\\^2\\)", "")
prednames_fig_halfse <- unique(prednames_fig_halfse[prednames_fig_halfse>0])
prednames_fig_halfse_num <- c(0)
} else {
prednames_fig_halfse <- paste(str_split(globModTerms_halfse, ":", simplify = TRUE))
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "I\\(", "")
prednames_fig_halfse <- str_replace(prednames_fig_halfse, "\\^2\\)", "")
prednames_fig_halfse <- unique(prednames_fig_halfse[prednames_fig_halfse>0])
prednames_fig_halfse_num <- length(prednames_fig_halfse)
}
# make a table
kable(coefs, col.names = c("Coefficient Name", "Value from best lambda model",
"Value from 1/2 se lambda", "Value from 1se lambda model")
) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Coefficient Name | Value from best lambda model | Value from 1/2 se lambda | Value from 1se lambda model |
|---|---|---|---|
| (Intercept) | -0.6747817 | -0.8616112 | -1.3908996 |
| prcp | 1.7134506 | 1.9694935 | 1.8591155 |
| prcp_seasonality | -0.0163393 | -0.3086721 | -0.5178721 |
| prcpTempCorr | -0.9128524 | -0.5967287 | -0.2352708 |
| annWetDegDays | 0.5337744 | NA | NA |
| annWatDef_95 | -0.6052394 | NA | NA |
| coarse | 0.2057888 | NA | NA |
| AWHC | -0.6156823 | -0.7655963 | -0.8858988 |
| I(tmean^2) | -0.3181863 | NA | NA |
| I(prcp_seasonality^2) | 0.2010413 | NA | NA |
| I(prcpTempCorr^2) | -0.8905565 | -0.6416890 | NA |
| I(isothermality^2) | 0.1422692 | NA | NA |
| I(clay^2) | -0.1181021 | -0.1661580 | -0.1421531 |
| I(AWHC^2) | 0.1127803 | NA | NA |
| annWetDegDays:annWatDef_95 | 0.9165646 | NA | NA |
| prcp_dry:isothermality | -0.2965084 | -0.4376182 | NA |
| prcpTempCorr:isothermality | -0.0206105 | NA | NA |
| isothermality:tmean | -0.2850425 | NA | NA |
| prcpTempCorr:prcp_dry | 0.5054909 | NA | NA |
| prcpTempCorr:tmean | -0.2528563 | NA | NA |
| AWHC:clay | -0.1629028 | -0.2506869 | NA |
| coarse:clay | 0.0916325 | NA | NA |
# calculate RMSE of all models
RMSE_best <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt")], truth = "obs", estimate = "pred_opt")$.estimate
RMSE_halfse <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt_halfse")], truth = "obs", estimate = "pred_opt_halfse")$.estimate
RMSE_1se <- yardstick::rmse(fullModOut$modelPredictions[,c("obs", "pred_opt_se")], truth = "obs", estimate = "pred_opt_se")$.estimate
# calculate bias of all models
bias_best <- mean((fullModOut$modelPredictions$obs) - fullModOut$modelPredictions$pred_opt)
bias_halfse <- mean((fullModOut$modelPredictions$obs) - fullModOut$modelPredictions$pred_opt_halfse)
bias_1se <- mean((fullModOut$modelPredictions$obs) - fullModOut$modelPredictions$pred_opt_se)
uniqueCoeffs <- data.frame("Best lambda model" = c(signif(RMSE_best,3), as.character(signif(bias_best, 3)),
as.integer(length(globModTerms)-1), as.integer(prednames_fig_num),
as.integer(sum(prednames_fig %in% c(prednames_clim))),
as.integer(sum(prednames_fig %in% c(prednames_weath))),
as.integer(sum(prednames_fig %in% c(prednames_soils)))
),
"1/2 se lambda model" = c(signif(RMSE_halfse,3), as.character(signif(bias_halfse, 3)),
length(globModTerms_halfse)-1, prednames_fig_halfse_num,
sum(prednames_fig_halfse %in% c(prednames_clim)),
sum(prednames_fig_halfse %in% c(prednames_weath)),
sum(prednames_fig_halfse %in% c(prednames_soils))),
"1se lambda model" = c(signif(RMSE_1se,3), as.character(signif(bias_1se, 3)),
length(globModTerms_1se)-1, prednames_fig_1se_num,
sum(prednames_fig_1se %in% c(prednames_clim)),
sum(prednames_fig_1se %in% c(prednames_weath)),
sum(prednames_fig_1se %in% c(prednames_soils))))
row.names(uniqueCoeffs) <- c("RMSE", "bias: mean(obs-pred.)", "Total number of coefficients", "Number of unique coefficients",
"Number of unique climate coefficients",
"Number of unique weather coefficients",
"Number of unique soils coefficients"
)
kable(uniqueCoeffs,
col.names = c("Best lambda model", "1/2 se lambda model", "1se lambda model"), row.names = TRUE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Best lambda model | 1/2 se lambda model | 1se lambda model | |
|---|---|---|---|
| RMSE | 0.315 | 0.323 | 0.333 |
| bias: mean(obs-pred.) | -2.79e-14 | -2.15e-13 | 8.62e-14 |
| Total number of coefficients | 21 | 8 | 5 |
| Number of unique coefficients | 11 | 7 | 5 |
| Number of unique climate coefficients | 8 | 5 | 3 |
| Number of unique weather coefficients | 0 | 0 | 0 |
| Number of unique soils coefficients | 3 | 2 | 2 |
In the following figures, we show model predictions made using the best lambda model, as well as an alternative “second-best” lambda model. As the alternative to the best lambda model, we use the model (1se or 1/2se of best Lambda) that has the fewest number of unique predictors, but is not a null model.
if (whichSecondBestMod == "auto") {
# name of model w/ fewest # of predictors (but more than 0)
uniqueCoeff_min <- min(as.numeric(uniqueCoeffs[4,2:3])[which(as.numeric(uniqueCoeffs[4,2:3]) > 0)])
alternativeModel <- names(uniqueCoeffs[4,2:3])[which(uniqueCoeffs[4,2:3] == uniqueCoeff_min)]
if (is.finite(uniqueCoeff_min)) {
if (length(alternativeModel) == 1) {
if (alternativeModel == "X1.2.se.lambda.model") {
mod_secondBest <- fit_glm_halfse
name_secondBestMod <- "1/2 SE Model"
prednames_secondBestMod <- prednames_fig_halfse
# get the name of the column that contains the prediction data for only the training data, for this model
trainDatPredName <- "pred_opt_halfse"
} else if (alternativeModel == "X1se.lambda.model") {
mod_secondBest <- fit_glm_se
name_secondBestMod <- "1 SE Model"
prednames_secondBestMod <- prednames_fig_1se
# get the name of the column that contains the prediction data for only the training data, for this model
trainDatPredName <- "pred_opt_se"
}
} else {
# if both alternative models have the same number of unique coefficients, chose the model that has the fewest number of total predictors
uniqueCoeff_min2 <- min(as.numeric(uniqueCoeffs[3,alternativeModel]))
alternativeModel2 <- names(uniqueCoeffs[3,alternativeModel])[which(uniqueCoeffs[3,alternativeModel] == uniqueCoeff_min2)]
if (length(alternativeModel2) == 1) {
if (alternativeModel2 == "X1.2.se.lambda.model") {
mod_secondBest <- fit_glm_halfse
name_secondBestMod <- "1/2 SE Model"
prednames_secondBestMod <- prednames_fig_halfse
# get the name of the column that contains the prediction data for only the training data, for this model
trainDatPredName <- "pred_opt_halfse"
} else if (alternativeModel2 == "X1se.lambda.model") {
mod_secondBest <- fit_glm_se
name_secondBestMod <- "1 SE Model"
prednames_secondBestMod <- prednames_fig_1se
# get the name of the column that contains the prediction data for only the training data, for this model
trainDatPredName <- "pred_opt_se"
}
} else {
mod_secondBest <- fit_glm_halfse
name_secondBestMod <- "1/2 SE Model"
prednames_secondBestMod <- prednames_fig_halfse
# get the name of the column that contains the prediction data for only the training data, for this model
trainDatPredName <- "pred_opt_halfse"
}
}
}else {
mod_secondBest <- NULL
name_secondBestMod <- "Intercept_Only"
prednames_secondBestMod <- NULL
}
} else if (whichSecondBestMod == "1se") {
mod_secondBest <- fit_glm_se
name_secondBestMod <- "1 SE Model"
prednames_secondBestMod <- prednames_fig_1se
# get the name of the column that contains the prediction data for only the training data, for this model
trainDatPredName <- "pred_opt_se"
} else if (whichSecondBestMod == "halfse") {
mod_secondBest <- fit_glm_halfse
name_secondBestMod <- "1/2 SE Model"
prednames_secondBestMod <- prednames_fig_halfse
# get the name of the column that contains the prediction data for only the training data, for this model
trainDatPredName <- "pred_opt_halfse"
}
# create prediction for each each model
# (i.e. for each fire proporation variable)
predict_by_response <- function(mod, df) {
df_out <- df
response_name <- paste0("TotalTreeCover_binom", "_pred")
preds <- predict(mod, newdata = df_out, #s="lambda.min",
type = "response")
#preds[preds<0] <- 0
#preds[preds>100] <- 100
df_out <- df_out %>% cbind(preds)
colnames(df_out)[ncol(df_out)] <- response_name
return(df_out)
}
## predict on the test dataset
pred_glm1_test <- predict_by_response(fit_glm_bestLambda, modDat_1_sTest)#X[,2:ncol(X)])
pred_glm1_test$resid <- pred_glm1_test[,"TotalTreeCover_binom"] - pred_glm1_test[,paste0("TotalTreeCover_binom", "_pred")]
pred_glm1_test$extremeResid <- NA
pred_glm1_test[pred_glm1_test$resid > .5 | pred_glm1_test$resid < -.5,"extremeResid"] <- 1
## predict on entire dataset
pred_glm1_full <- predict_by_response(fit_glm_bestLambda, rbind(modDat_1_sTest, modDat_1_s))#X[,2:ncol(X)])
pred_glm1_full$resid <- pred_glm1_full[,"TotalTreeCover_binom"] - pred_glm1_full[,paste0("TotalTreeCover_binom", "_pred")]
pred_glm1_full$extremeResid <- NA
pred_glm1_full[pred_glm1_full$resid > .5 | pred_glm1_full$resid < -.5,"extremeResid"] <- 1
if ( name_secondBestMod == "Intercept_Only") {
print("The next best lambda model only contains one predictor (an intercept)")
} else {
pred_glm1_1se_test <- predict_by_response(mod_secondBest, modDat_1_sTest)#X[,2:ncol(X)])
pred_glm1_1se_test$resid <- pred_glm1_1se_test[,"TotalTreeCover_binom"] - pred_glm1_1se_test[,paste0("TotalTreeCover_binom", "_pred")]
pred_glm1_1se_test$extremeResid <- NA
pred_glm1_1se_test[pred_glm1_1se_test$resid > .5 | pred_glm1_1se_test$resid < -.5,"extremeResid"] <- 1
## for full dataset
pred_glm1_1se_full <- predict_by_response(mod_secondBest, rbind(modDat_1_sTest, modDat_1_s))#X[,2:ncol(X)])
pred_glm1_1se_full$resid <- pred_glm1_1se_full[,"TotalTreeCover_binom"] - pred_glm1_1se_full[,paste0("TotalTreeCover_binom", "_pred")]
pred_glm1_1se_full$extremeResid <- NA
pred_glm1_1se_full[pred_glm1_1se_full$resid > .5 | pred_glm1_1se_full$resid < -.5,"extremeResid"] <- 1
}
Identifying the optimal threshold based on the prevalence in the training data. ROC Curves based on classification accuracy of the model when used on the testing set (held-out years 2003, 2012 and 2019)
For the bestLambda model:
library(PresenceAbsence)
DATA <- fullModOut$modelPredictions[,c("obs", "pred_opt")] # only training data
DATA$ID <- 1:nrow(DATA)
#pred.prev <- predicted.prevalence(DATA = DATA[,c("ID", "TotalTreeCover_binom", "TotalTreeCover_binom_pred")], threshold = 20)
#accu <- presence.absence.accuracy(DATA[,c("ID", "TotalTreeCover_binom", "TotalTreeCover_binom_pred")], threshold = 20)
optimalThresholds <- optimal.thresholds(DATA = DATA[,c("ID", "obs", "pred_opt")],
threshold = 200, obs.prev = mean(DATA[,"obs"])) # tests 100 thresholds)
rm(DATA)
best_threshold_bestLambda <- optimalThresholds[optimalThresholds$Method == thresholdMethod, 2]
cat(paste0("Optimal threshold for best lambda model based on ROC curve, based on the ",thresholdMethod, " method: ", best_threshold_bestLambda, "\n"))
## Optimal threshold for best lambda model based on ROC curve, based on the PredPrev=Obs method: 0.381909547738693
# Now make an ROC curve based on the test set
roc_curve <- roc(pred_glm1_test$TotalTreeCover_binom, pred_glm1_test$TotalTreeCover_binom_pred)
# Plot ROC curve
plot(roc_curve, main = paste0("ROC Curve for best lambda model \n","AUC: ",round(roc_curve$auc,2)))
For the second-bestLambda model:
# best_threshold_secondBest <- coords$threshold
DATA <- fullModOut$modelPredictions[,c("obs", trainDatPredName)]
DATA$ID <- 1:nrow(DATA)
#pred.prev <- predicted.prevalence(DATA = DATA[,c("ID", "TotalTreeCover_binom", "TotalTreeCover_binom_pred")], threshold = 20)
#accu <- presence.absence.accuracy(DATA[,c("ID", "TotalTreeCover_binom", "TotalTreeCover_binom_pred")], threshold = 20)
optimalThresholds <- optimal.thresholds(DATA = DATA[,c("ID", "obs", trainDatPredName)],
threshold = 200, obs.prev = mean(DATA[,"obs"])) # tests 100 thresholds)
rm(DATA)
best_threshold_secondBest <- optimalThresholds[optimalThresholds$Method == thresholdMethod, 2]
cat(paste0("Optimal threshold for second-best lambda model based on ROC curve, based on the ",thresholdMethod, " method: ", best_threshold_secondBest, "\n"))
## Optimal threshold for second-best lambda model based on ROC curve, based on the PredPrev=Obs method: 0.346733668341709
# Now make an ROC curve based on the test set
roc_curve_secondBest <- roc(pred_glm1_1se_test$TotalTreeCover_binom, pred_glm1_1se_test$TotalTreeCover_binom_pred, smooth = FALSE)
# Plot ROC curve
plot(roc_curve_secondBest, main = paste0("ROC Curve for second-best lambda model \n","AUC: ",round(roc_curve_secondBest$auc,2)))
## for test dataset
# best lambda model
# "binomialize" the continuous predictions
pred_glm1_test <- pred_glm1_test %>%
mutate(TotalTreeCover_binom_pred_binary = TotalTreeCover_binom_pred,
TotalTreeCover_binom_pred_binary = replace(TotalTreeCover_binom_pred_binary, TotalTreeCover_binom_pred_binary > best_threshold_bestLambda, 1),
TotalTreeCover_binom_pred_binary = replace(TotalTreeCover_binom_pred_binary, TotalTreeCover_binom_pred_binary <= best_threshold_bestLambda, 0))
# second-best lambda model
# "binomialize" the continuouse predictions
pred_glm1_1se_test <- pred_glm1_1se_test %>%
mutate(TotalTreeCover_binom_pred_binary = TotalTreeCover_binom_pred,
TotalTreeCover_binom_pred_binary = replace(TotalTreeCover_binom_pred_binary, TotalTreeCover_binom_pred_binary > best_threshold_secondBest, 1),
TotalTreeCover_binom_pred_binary = replace(TotalTreeCover_binom_pred_binary, TotalTreeCover_binom_pred_binary <= best_threshold_secondBest, 0))
## for full dataset
# best lambda model
# "binomialize" the continuous predictions
pred_glm1_full <- pred_glm1_full %>%
mutate(TotalTreeCover_binom_pred_binary = TotalTreeCover_binom_pred,
TotalTreeCover_binom_pred_binary = replace(TotalTreeCover_binom_pred_binary, TotalTreeCover_binom_pred_binary > best_threshold_bestLambda, 1),
TotalTreeCover_binom_pred_binary = replace(TotalTreeCover_binom_pred_binary, TotalTreeCover_binom_pred_binary <= best_threshold_bestLambda, 0))
# second-best lambda model
# "binomialize" the continuouse predictions
pred_glm1_1se_full <- pred_glm1_1se_full %>%
mutate(TotalTreeCover_binom_pred_binary = TotalTreeCover_binom_pred,
TotalTreeCover_binom_pred_binary = replace(TotalTreeCover_binom_pred_binary, TotalTreeCover_binom_pred_binary > best_threshold_secondBest, 1),
TotalTreeCover_binom_pred_binary = replace(TotalTreeCover_binom_pred_binary, TotalTreeCover_binom_pred_binary <= best_threshold_secondBest, 0))
# true positive = tree (1)
# false positive = model says tree, data says no-tree (2)
# true negative = no tree (3)
# false negative = model says no tree, data says tree (4)
# best lambda model - test dataset
pred_glm1_test$missClass <- NA
pred_glm1_test[pred_glm1_test$TotalTreeCover_binom == 1 &
pred_glm1_test$TotalTreeCover_binom_pred_binary == 1, "missClass"] <- 1
pred_glm1_test[pred_glm1_test$TotalTreeCover_binom == 0 &
pred_glm1_test$TotalTreeCover_binom_pred_binary == 0, "missClass"] <- 3
pred_glm1_test[pred_glm1_test$TotalTreeCover_binom == 0 &
pred_glm1_test$TotalTreeCover_binom_pred_binary == 1, "missClass"] <- 2
pred_glm1_test[pred_glm1_test$TotalTreeCover_binom == 1 &
pred_glm1_test$TotalTreeCover_binom_pred_binary == 0, "missClass"] <- 4
# best lambda model - full dataset
pred_glm1_full$missClass <- NA
pred_glm1_full[pred_glm1_full$TotalTreeCover_binom == 1 &
pred_glm1_full$TotalTreeCover_binom_pred_binary == 1, "missClass"] <- 1
pred_glm1_full[pred_glm1_full$TotalTreeCover_binom == 0 &
pred_glm1_full$TotalTreeCover_binom_pred_binary == 0, "missClass"] <- 3
pred_glm1_full[pred_glm1_full$TotalTreeCover_binom == 0 &
pred_glm1_full$TotalTreeCover_binom_pred_binary == 1, "missClass"] <- 2
pred_glm1_full[pred_glm1_full$TotalTreeCover_binom == 1 &
pred_glm1_full$TotalTreeCover_binom_pred_binary == 0, "missClass"] <- 4
# second-best lambda model - test dataset
pred_glm1_1se_test$missClass <- NA
pred_glm1_1se_test[pred_glm1_1se_test$TotalTreeCover_binom == 1 &
pred_glm1_1se_test$TotalTreeCover_binom_pred_binary == 1, "missClass"] <- 1
pred_glm1_1se_test[pred_glm1_1se_test$TotalTreeCover_binom == 0 &
pred_glm1_1se_test$TotalTreeCover_binom_pred_binary == 0, "missClass"] <- 3
pred_glm1_1se_test[pred_glm1_1se_test$TotalTreeCover_binom == 0 &
pred_glm1_1se_test$TotalTreeCover_binom_pred_binary == 1, "missClass"] <- 2
pred_glm1_1se_test[pred_glm1_1se_test$TotalTreeCover_binom == 1 &
pred_glm1_1se_test$TotalTreeCover_binom_pred_binary == 0, "missClass"] <- 4
# second-best lambda model - full dataset
pred_glm1_1se_full$missClass <- NA
pred_glm1_1se_full[pred_glm1_1se_full$TotalTreeCover_binom == 1 &
pred_glm1_1se_full$TotalTreeCover_binom_pred_binary == 1, "missClass"] <- 1
pred_glm1_1se_full[pred_glm1_1se_full$TotalTreeCover_binom == 0 &
pred_glm1_1se_full$TotalTreeCover_binom_pred_binary == 0, "missClass"] <- 3
pred_glm1_1se_full[pred_glm1_1se_full$TotalTreeCover_binom == 0 &
pred_glm1_1se_full$TotalTreeCover_binom_pred_binary == 1, "missClass"] <- 2
pred_glm1_1se_full[pred_glm1_1se_full$TotalTreeCover_binom == 1 &
pred_glm1_1se_full$TotalTreeCover_binom_pred_binary == 0, "missClass"] <- 4
# rasterize
# get reference raster
# use the gridMet raster
test_rast_dayMet <- rast("../../../Data_raw/dayMet/rawMonthlyData/orders/70e0da02b9d2d6e8faa8c97d211f3546/Daymet_Monthly_V4R1/data/daymet_v4_prcp_monttl_na_1980.tif") %>%
#terra::project(terra::crs("EPSG:4326")) %>%
terra::crop(ext(-2000000, 2600000, -1900000, 1500000))
## aggregate to a 4x4km^2 grid
test_rast <- test_rast_dayMet %>%
terra::aggregate(fact = 8, fun = "mean")
# test_rast_Untranfsormed <- rast("../../../Data_raw/gridMet/pr_2020.nc") #%>% ## mean cell size of unaggregated grid size (16709380m^2)
# test_rast <- test_rast_Untranfsormed %>%
# terra::aggregate(fact = 2, fun = "mean") #%>%
regions <- sf::st_read(dsn = "../../../Data_raw/Level2Ecoregions/", layer = "NA_CEC_Eco_Level2")
## Reading layer `NA_CEC_Eco_Level2' from data source
## `/Users/astears/Documents/Dropbox_static/Work/NAU_USGS_postdoc/cleanPED/PED_vegClimModels/Data_raw/Level2Ecoregions'
## using driver `ESRI Shapefile'
## Simple feature collection with 2261 features and 8 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -4334052 ymin: -3313739 xmax: 3324076 ymax: 4267265
## Projected CRS: Sphere_ARC_INFO_Lambert_Azimuthal_Equal_Area
regions <- regions %>%
st_transform(crs = st_crs(crs(test_rast))) %>%
st_make_valid()
ecoregionLU <- data.frame("NA_L1NAME" = sort(unique(regions$NA_L1NAME)),
"newRegion" = c(NA, "Forest", "dryShrubGrass",
"dryShrubGrass", "Forest", "dryShrubGrass",
"dryShrubGrass", "Forest", "Forest",
"dryShrubGrass", "Forest", "Forest",
"Forest", "Forest", "dryShrubGrass",
NA
))
goodRegions <- regions %>%
left_join(ecoregionLU)
mapRegions <- goodRegions %>%
filter(!is.na(newRegion)) %>%
group_by(newRegion) %>%
summarise(geometry = sf::st_union(geometry)) %>%
ungroup() %>%
st_simplify(dTolerance = 1000) %>%
sf::st_crop(c("xmin" = -2000000, "xmax" = 2600000, "ymin" = -1900000, "ymax" = 1500000))
# make shapefile of cropped state boundaries in appropriate crs
cropped_states_2 <- cropped_states %>%
st_transform(crs = crs(test_rast)) %>%
st_make_valid() %>%
sf::st_crop(c("xmin" = -2000000, "xmax" = 2600000, "ymin" = -1900000, "ymax" = 1500000))
# function for getting the statistical mode
Mode <- function(x, ...) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
# make a multi-layer raster w/ layers for each variable of interest
pred_glm1_full_sf <- pred_glm1_full %>%
st_as_sf(coords = c("x", "y"), crs = crs("EPSG:4326")) %>%
st_transform(crs(test_rast)) %>%
terra::vect()
testRast2 <- lapply(c("TotalTreeCover", "TotalTreeCover_binom", "TotalTreeCover_binom_pred", "TotalTreeCover_binom_pred_binary", "resid", "missClass"), FUN = function(x) {
# S4 method for class 'data.frame'
if (x %in% c("TotalTreeCover", "TotalTreeCover_binom", "TotalTreeCover_binom_pred", "TotalTreeCover_binom_pred_binary", "resid")) {
temp <- terra::rasterize(x = pred_glm1_full_sf, y = test_rast,
field = x, fun = mean, na.rm = TRUE)
names(temp) <- x
} else {
temp <- terra::rasterize(x = pred_glm1_full_sf, y = test_rast,
field = x, fun = Mode, na.rm = TRUE)
# get the % aggreement between values in the dayMetx8 cell
temp2 <- terra::rasterize(x = pred_glm1_full_sf, y = test_rast$daymet_v4_prcp_monttl_na_1980_1,
field = x, fun = function(y) {
yTemp <- y[!is.na(y)]
yMode <- Mode(yTemp)
(sum(yTemp == yMode)/length(yTemp))*100
})
temp <- c(as.factor(temp$missClass), temp2)
names(temp) <- c("missClass", "PercAgreement")
}
return(temp)
}
)
pred_glm1_full_RAST <- c(testRast2[[1]], testRast2[[2]], testRast2[[3]], testRast2[[4]], testRast2[[5]], testRast2[[6]])
# ## compare
# par(mfrow = c(2,1))
# terra::plot(pred_glm1_full_RAST$TotalTreeCover, maxcell = 30000000, main = "gridMet")
# terra::plot(pred_glm1_full_RAST_dayMet$TotalTreeCover, maxcell = 30000000, main = "dayMet")
# par(mfrow = c(1,1))
##
# make figure of raw tree cover
map_obs_cont <- ggplot() +
geom_spatraster(data = pred_glm1_full_RAST, aes(fill = TotalTreeCover), maxcell = 50000000) +
#geom_sf(data = pred_glm1_full_RAST, aes(col = TotalTreeCover)) +
# stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover), fun = mean, binwidth = .05) +
geom_sf(data=cropped_states_2 ,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Observations of Continuous Total Tree Cover - all data")) +
scale_fill_gradient2(low = "brown",
mid = "wheat" ,
high = "forestgreen" ,
midpoint = 0, na.value = "white") +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
# xlim(-104, -100) +
# ylim(30, 33)
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
# make figure of binomialized tree cover
map_obs <- ggplot() +
geom_spatraster(data = pred_glm1_full_RAST, aes(fill = TotalTreeCover_binom), maxcell = 50000000) +
#geom_sf(data = plotObs, aes(col = TotalTreeCover_binom)) +
#stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover_binom), fun = mean, binwidth = .05) +
scale_fill_viridis_c(option = "A", guide = guide_colorbar(title = "% cover"),
limits = c(0,100)) +
geom_sf(data=cropped_states_2 ,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Observations of Binomial Total Tree Cover - all data")) +
scale_fill_continuous(breaks = c(0, 0.5, 1), palette = c("wheat", "forestgreen"),
labels = c("No Trees",
"",
"Trees"), na.value = "white") +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
hist_obs_cont <- pred_glm1_full %>%
ggplot() +
geom_histogram(aes(TotalTreeCover), fill = "lightgrey", col = "darkgrey")
hist_obs <- pred_glm1_full%>%
ggplot() +
geom_histogram(aes(TotalTreeCover_binom), fill = "lightgrey", col = "darkgrey")
library(ggpubr)
ggarrange(map_obs_cont, hist_obs_cont, map_obs, hist_obs, heights = c(3, 1, 3, 1), ncol = 1)
pred_glm1_test_sf <- pred_glm1_test %>%
st_as_sf(coords = c("x", "y"), crs = crs("EPSG:4326")) %>%
st_transform(crs(test_rast)) %>%
terra::vect()
# make raster of test set
# make a multi-layer raster w/ layers for each variable of interest
testRast_test <- lapply(c("TotalTreeCover", "TotalTreeCover_binom", "TotalTreeCover_binom_pred", "TotalTreeCover_binom_pred_binary", "resid", "missClass"), FUN = function(x) {
# S4 method for class 'data.frame'
if (x %in% c("TotalTreeCover", "TotalTreeCover_binom", "TotalTreeCover_binom_pred", "TotalTreeCover_binom_pred_binary", "resid")) {
temp <- terra::rasterize(x = pred_glm1_test_sf, y = test_rast,
field = x, fun = mean, na.rm = TRUE)
names(temp) <- x
} else {
temp <- terra::rasterize(x = pred_glm1_test_sf, y = test_rast,
field = x, fun = Mode, na.rm = TRUE)
# get the % aggreement between values in the dayMetx8 cell
temp2 <- terra::rasterize(x = pred_glm1_test_sf, y = test_rast$daymet_v4_prcp_monttl_na_1980_1,
field = x, fun = function(y) {
yTemp <- y[!is.na(y)]
yMode <- Mode(yTemp)
(sum(yTemp == yMode)/length(yTemp))*100
})
temp <- c(as.factor(temp$missClass), temp2)
names(temp) <- c("missClass", "PercAgreement")
}
return(temp)
}
)
pred_glm1_test_RAST <- c(testRast_test[[1]], testRast_test[[2]], testRast_test[[3]], testRast_test[[4]], testRast_test[[5]], testRast_test[[6]])
# make maps
map_obs_cont_test <- ggplot() +
geom_spatraster(data = pred_glm1_test_RAST, aes(fill = TotalTreeCover), maxcell = 50000000) +
#geom_sf(data = pred_glm1_full_RAST, aes(col = TotalTreeCover)) +
# stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover), fun = mean, binwidth = .05) +
geom_sf(data=cropped_states_2 ,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Observations of Continuous Total Tree Cover - test data")) +
scale_fill_gradient2(low = "brown",
mid = "wheat" ,
high = "forestgreen" ,
midpoint = 0, na.value = "white") +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
# xlim(-104, -100) +
# ylim(30, 33)
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
# make figure of binomialized tree cover
map_obs_test <- ggplot() +
geom_spatraster(data = pred_glm1_test_RAST, aes(fill = TotalTreeCover_binom), maxcell = 50000000) +
#geom_sf(data = plotObs, aes(col = TotalTreeCover_binom)) +
#stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover_binom), fun = mean, binwidth = .05) +
scale_fill_viridis_c(option = "A", guide = guide_colorbar(title = "% cover"),
limits = c(0,100)) +
geom_sf(data=cropped_states_2 ,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Observations of Binomial Total Tree Cover - test data")) +
scale_fill_continuous(breaks = c(0, 0.5, 1), palette = c("wheat", "forestgreen"),
labels = c("No Trees",
"",
"Trees"), na.value = "white") +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
hist_obs_cont_test <- pred_glm1_test %>%
ggplot() +
geom_histogram(aes(TotalTreeCover), fill = "lightgrey", col = "darkgrey")
hist_obs_test <- pred_glm1_test%>%
ggplot() +
geom_histogram(aes(TotalTreeCover_binom), fill = "lightgrey", col = "darkgrey")
ggarrange(map_obs_cont_test, hist_obs_cont_test, map_obs_test, hist_obs_test, heights = c(3, 1, 3, 1), ncol = 1)
# make figures of predictions w/ the full dataset
map_preds1_cont <- ggplot() +
geom_spatraster(data = pred_glm1_full_RAST, aes(fill = TotalTreeCover_binom_pred), maxcell = 20000000) +
#geom_sf(data = plotObs_rast2, aes(col = TotalTreeCover)) +
# stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover), fun = mean, binwidth = .05) +
geom_sf(data=cropped_states_2 ,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Predictions from the bestaLambda model of Yes/No Trees \ncontinuous probabilities \nfull dataset")) +
scale_fill_gradient2(low = "brown",
mid = "wheat" ,
high = "forestgreen" ,
midpoint = 0, na.value = "white") +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
# make figure of binomialized tree cover
map_preds1 <- ggplot() +
geom_spatraster(data = pred_glm1_full_RAST, aes(fill = TotalTreeCover_binom_pred_binary), maxcell = 20000000) +
#geom_sf(data = plotObs, aes(col = TotalTreeCover_binom)) +
#stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover_binom), fun = mean, binwidth = .05) +
scale_fill_viridis_c(option = "A", guide = guide_colorbar(title = "% cover"),
limits = c(0,100)) +
geom_sf(data=cropped_states_2 ,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Predictions from the bestaLambda model of Yes/No Trees \nbinary probabilities \nfull dataset")) +
scale_fill_continuous(breaks = c(0, 0.5, 1), palette = c("wheat", "forestgreen"),
labels = c("No Trees",
"",
"Trees"), na.value = "white") +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
hist_preds1_cont <- pred_glm1_full %>%
ggplot() +
geom_histogram(aes(TotalTreeCover_binom_pred), fill = "lightgrey", col = "darkgrey")
hist_preds1 <- pred_glm1_full%>%
ggplot() +
geom_histogram(aes(TotalTreeCover_binom_pred_binary), fill = "lightgrey", col = "darkgrey")
ggarrange(map_preds1_cont, hist_preds1_cont, map_preds1, hist_preds1, heights = c(3, 1, 3, 1), ncol = 1)
# make figures of predictions w/ the test dataset
map_preds1_cont_test <- ggplot() +
geom_spatraster(data = pred_glm1_test_RAST, aes(fill = TotalTreeCover_binom_pred), maxcell = 20000000) +
#geom_sf(data = plotObs_rast2, aes(col = TotalTreeCover)) +
# stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover), fun = mean, binwidth = .05) +
geom_sf(data=cropped_states_2 ,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Predictions from the bestaLambda model of Yes/No Trees \ncontinuous probabilities \ntest dataset")) +
scale_fill_gradient2(low = "brown",
mid = "wheat" ,
high = "forestgreen" ,
midpoint = 0, na.value = "white") +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
# make figure of binomialized tree cover
map_preds1_test <- ggplot() +
geom_spatraster(data = pred_glm1_test_RAST, aes(fill = TotalTreeCover_binom_pred_binary), maxcell = 20000000) +
#geom_sf(data = plotObs, aes(col = TotalTreeCover_binom)) +
#stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover_binom), fun = mean, binwidth = .05) +
scale_fill_viridis_c(option = "A", guide = guide_colorbar(title = "% cover"),
limits = c(0,100)) +
geom_sf(data=cropped_states_2 ,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Predictions from the bestaLambda model of Yes/No Trees \nbinary probabilities \ntest dataset")) +
scale_fill_continuous(breaks = c(0, 0.5, 1), palette = c("wheat", "forestgreen"),
labels = c("No Trees",
"",
"Trees"), na.value = "white") +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
hist_preds1_cont_test <- pred_glm1_test %>%
ggplot() +
geom_histogram(aes(TotalTreeCover_binom_pred), fill = "lightgrey", col = "darkgrey")
hist_preds1_test <- pred_glm1_test%>%
ggplot() +
geom_histogram(aes(TotalTreeCover_binom_pred_binary), fill = "lightgrey", col = "darkgrey")
ggarrange(map_preds1_cont_test, hist_preds1_cont_test, map_preds1_test, hist_preds1_test, heights = c(3, 1, 3, 1), ncol = 1)
if ( name_secondBestMod == "Intercept_Only") {
print("The next best lambda model only contains one predictor (an intercept)")
} else {
pred_glm1_1se_full_sf <- pred_glm1_1se_full %>%
st_as_sf(coords = c("x", "y"), crs = crs("EPSG:4326")) %>%
st_transform(crs(test_rast)) %>%
terra::vect()
# make a raster of the full dataset
# make a multi-layer raster w/ layers for each variable of interest
testRast3 <- lapply(c("TotalTreeCover", "TotalTreeCover_binom", "TotalTreeCover_binom_pred", "TotalTreeCover_binom_pred_binary", "resid", "missClass"), FUN = function(x) {
# S4 method for class 'data.frame'
if (x %in% c("TotalTreeCover", "TotalTreeCover_binom", "TotalTreeCover_binom_pred", "TotalTreeCover_binom_pred_binary", "resid")) {
temp <- terra::rasterize(x = pred_glm1_1se_full_sf, y = test_rast,
field = x, fun = mean, na.rm = TRUE)
names(temp) <- x
} else {
temp <- terra::rasterize(x = pred_glm1_1se_full_sf, y = test_rast,
field = x, fun = Mode, na.rm = TRUE)
# get the % aggreement between values in the dayMetx8 cell
temp2 <- terra::rasterize(x = pred_glm1_1se_full_sf, y = test_rast$daymet_v4_prcp_monttl_na_1980_1,
field = x, fun = function(y) {
yTemp <- y[!is.na(y)]
yMode <- Mode(yTemp)
(sum(yTemp == yMode)/length(yTemp))*100
})
temp <- c(as.factor(temp$missClass), temp2)
names(temp) <- c("missClass", "PercAgreement")
}
return(temp)
}
)
pred_glm1_1se_full_RAST <- c(testRast3[[1]], testRast3[[2]], testRast3[[3]], testRast3[[4]], testRast3[[5]], testRast3[[6]])
# make raster of test set
pred_glm1_1se_test_sf <- pred_glm1_1se_test %>%
st_as_sf(coords = c("x", "y"), crs = crs("EPSG:4326")) %>%
st_transform(crs(test_rast)) %>%
terra::vect()
# make a multi-layer raster w/ layers for each variable of interest
testRast_test2 <- lapply(c("TotalTreeCover", "TotalTreeCover_binom", "TotalTreeCover_binom_pred", "TotalTreeCover_binom_pred_binary", "resid", "missClass"), FUN = function(x) {
# S4 method for class 'data.frame'
if (x %in% c("TotalTreeCover", "TotalTreeCover_binom", "TotalTreeCover_binom_pred", "TotalTreeCover_binom_pred_binary", "resid")) {
temp <- terra::rasterize(x = pred_glm1_1se_test_sf, y = test_rast,
field = x, fun = mean, na.rm = TRUE)
names(temp) <- x
} else {
temp <- terra::rasterize(x = pred_glm1_1se_test_sf, y = test_rast,
field = x, fun = Mode, na.rm = TRUE)
# get the % aggreement between values in the dayMetx8 cell
temp2 <- terra::rasterize(x = pred_glm1_1se_test_sf, y = test_rast$daymet_v4_prcp_monttl_na_1980_1,
field = x, fun = function(y) {
yTemp <- y[!is.na(y)]
yMode <- Mode(yTemp)
(sum(yTemp == yMode)/length(yTemp))*100
})
temp <- c(as.factor(temp$missClass), temp2)
names(temp) <- c("missClass", "PercAgreement")
}
return(temp)
}
)
pred_glm1_1se_test_RAST <- c(testRast_test2[[1]], testRast_test2[[2]], testRast_test2[[3]], testRast_test2[[4]], testRast_test2[[5]], testRast_test2[[6]])
# make figures of predictions w/ the full dataset
map_preds2_cont <- ggplot() +
geom_spatraster(data = pred_glm1_1se_full_RAST, aes(fill = TotalTreeCover_binom_pred), maxcell = 20000000) +
#geom_sf(data = plotObs_rast2, aes(col = TotalTreeCover)) +
# stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover), fun = mean, binwidth = .05) +
geom_sf(data=cropped_states_2 ,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Predictions from the ", name_secondBestMod, " of Yes/No Trees \ncontinuous probabilities \nfull dataset")) +
scale_fill_gradient2(low = "brown",
mid = "wheat" ,
high = "forestgreen" ,
midpoint = 0, na.value = "white") +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
# make figure of binomialized tree cover
map_preds2 <- ggplot() +
geom_spatraster(data = pred_glm1_1se_full_RAST, aes(fill = TotalTreeCover_binom_pred_binary), maxcell = 20000000) +
#geom_sf(data = plotObs, aes(col = TotalTreeCover_binom)) +
#stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover_binom), fun = mean, binwidth = .05) +
geom_sf(data=cropped_states_2 ,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Predictions from the ", name_secondBestMod, " of Yes/No Trees \nbinary probabilities \nfull dataset")) +
scale_fill_continuous(breaks = c(0, 0.5, 1), palette = c("wheat", "forestgreen"),
labels = c("No Trees",
"",
"Trees"), na.value = "white") +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
hist_preds2_cont <- pred_glm1_1se_full %>%
ggplot() +
geom_histogram(aes(TotalTreeCover_binom_pred), fill = "lightgrey", col = "darkgrey")
hist_preds2 <- pred_glm1_1se_full%>%
ggplot() +
geom_histogram(aes(TotalTreeCover_binom_pred_binary), fill = "lightgrey", col = "darkgrey")
# make figures of predictions w/ the test dataset
map_preds2_cont_test <- ggplot() +
geom_spatraster(data = pred_glm1_1se_test_RAST, aes(fill = TotalTreeCover_binom_pred), maxcell = 20000000) +
#geom_sf(data = plotObs_rast2, aes(col = TotalTreeCover)) +
# stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover), fun = mean, binwidth = .05) +
geom_sf(data=cropped_states_2 ,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Predictions from the ", name_secondBestMod, " of Yes/No Trees \ncontinuous probabilities \ntest dataset")) +
scale_fill_gradient2(low = "brown",
mid = "wheat" ,
high = "forestgreen" ,
midpoint = 0, na.value = "white") +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
# make figure of binomialized tree cover
map_preds2_test <- ggplot() +
geom_spatraster(data = pred_glm1_1se_test_RAST, aes(fill = TotalTreeCover_binom_pred_binary), maxcell = 20000000) +
#geom_sf(data = plotObs, aes(col = TotalTreeCover_binom)) +
#stat_summary_2d(data = plotObs, aes(x = x, y = y, z = TotalTreeCover_binom), fun = mean, binwidth = .05) +
geom_sf(data=cropped_states_2 ,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
labs(title = paste0("Predictions from the ", name_secondBestMod, " of Yes/No Trees \nbinary probabilities \ntest dataset")) +
scale_fill_continuous(breaks = c(0, 0.5, 1), palette = c("wheat", "forestgreen"),
labels = c("No Trees",
"",
"Trees"), na.value = "white") +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
hist_preds2_cont_test <- pred_glm1_1se_test %>%
ggplot() +
geom_histogram(aes(TotalTreeCover_binom_pred), fill = "lightgrey", col = "darkgrey")
hist_preds2_test <- pred_glm1_1se_test%>%
ggplot() +
geom_histogram(aes(TotalTreeCover_binom_pred_binary), fill = "lightgrey", col = "darkgrey")
ggarrange(ggarrange(map_preds2_cont, hist_preds2_cont, map_preds2, hist_preds2, heights = c(3, 1, 3, 1), ncol = 1),
ggarrange(map_preds2_cont_test, hist_preds2_cont_test, map_preds2_test, hist_preds2_test, heights = c(3, 1, 3, 1), ncol = 1),
ncol = 1
)
}
# true positive = tree (1) (forest green)
# true negative = no tree (3) (wheat)
# false positive = model says tree, data says no-tree (2) (red)
# false negative = model says no tree, data says tree (4) (blue)
# make figures
map_missClass <- ggplot() +
geom_spatraster(data =pred_glm1_full_RAST
, aes(fill= missClass), maxcell = 20000000) +
#stat_summary_2d(data = pred_glm1, aes(x = x, y = y, z = missClass), fun = function(x) {round(mean(x))}, binwidth = .05) +
geom_sf(data=cropped_states_2,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
#geom_sf(data = badResids_high, col = "blue") +
#geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Misclassification and correct classifications \nfrom the bestLambda model of Yes/No Trees \nfull dataset"),
subtitle = "bestLambda model \nvalues show are the most common classification in each 8km x 8km grid cell") +
scale_fill_discrete(palette =
c("forestgreen", "red", "wheat", "blue"),
na.value = "white",
labels = c("forest", "Data = no trees; Model = trees", "non-forest", "Data = trees; Model = no trees", "")) +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
# make a map of the aggreement in classification w/in each 8x8km cell
map_missClass_ag <- ggplot() +
geom_spatraster(data = pred_glm1_full_RAST, aes(fill= PercAgreement), maxcell = 20000000) +
#stat_summary_2d(data = pred_glm1, aes(x = x, y = y, z = missClass), fun = function(x) {round(mean(x))}, binwidth = .05) +
geom_sf(data=cropped_states_2,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
#geom_sf(data = badResids_high, col = "blue") +
#geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("% Agreement of the classification values within each 8x8km^2 gridcell \nclassifications using the bestLambda model of Yes/No Trees \nfull dataset"))+
scale_fill_viridis_c(option = "B", na.value = "white", name = "% agreement", ) +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
ggarrange(map_missClass, map_missClass_ag, ncol = 1, heights = c(1, .7), align = "v")
# make a confusion matrix
# prepare data
matData <- pred_glm1_full %>%
mutate(predClass = TotalTreeCover_binom_pred_binary,
obsClass = TotalTreeCover_binom) %>%
mutate(predClass = replace(predClass, predClass == 0, "no trees"),
predClass = replace(predClass, predClass == 1, "trees"),
obsClass = replace(obsClass, obsClass == 0, "no trees"),
obsClass = replace(obsClass, obsClass == 1, "trees")) %>%
mutate(predClass = as.factor(predClass) ,
obsClass = as.factor(obsClass))
# make matrix as a data.frame
# confMat <- confusionMatrix(data = matData$predClass,
# reference = matData$obsClass)
ConfusionTableR::binary_visualiseR(
train_labels = as.factor(matData$predClass),
truth_labels = as.factor(matData$obsClass),
class_label1 = "No trees",
class_label2 = "Trees",
quadrant_col1 = "wheat",
quadrant_col2 = "forestgreen",
text_col = "white", custom_title = "Confusion Matrix: classification of full dataset \nwith the best lambda model using the full dataset"
)
### test dataset
# make figures
map_missClass_test <- ggplot() +
geom_spatraster(data =pred_glm1_test_RAST, aes(fill= missClass), maxcell = 20000000) +
#stat_summary_2d(data = pred_glm1, aes(x = x, y = y, z = missClass), fun = function(x) {round(mean(x))}, binwidth = .05) +
geom_sf(data=cropped_states_2,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
#geom_sf(data = badResids_high, col = "blue") +
#geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Misclassification and correct classifications \nfrom the bestLambda model of Yes/No Trees \ntest dataset"),
subtitle = "bestLambda model \nvalues show are the most common classification in each 8km x 8km grid cell") +
scale_fill_discrete(palette =
c("forestgreen", "red", "wheat", "blue"),
na.value = "white",
labels = c("forest", "Data = no trees; Model = trees", "non-forest", "Data = trees; Model = no trees", "")) +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
map_missClass_test_ag <- ggplot() +
geom_spatraster(data = pred_glm1_test_RAST, aes(fill= PercAgreement), maxcell = 20000000) +
#stat_summary_2d(data = pred_glm1, aes(x = x, y = y, z = missClass), fun = function(x) {round(mean(x))}, binwidth = .05) +
geom_sf(data=cropped_states_2,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
#geom_sf(data = badResids_high, col = "blue") +
#geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("% Agreement of the classification values within each 8x8km^2 gridcell \nclassifications using the bestLambda model of Yes/No Trees \ntest dataset"))+
scale_fill_viridis_c(option = "B", na.value = "white", name = "% agreement", ) +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
ggarrange(map_missClass_test, map_missClass_test_ag, ncol = 1, heights = c(1, .7), align = "v")
# make a confusion matrix
# prepare data
matData_test <- pred_glm1_test %>%
mutate(predClass = TotalTreeCover_binom_pred_binary,
obsClass = TotalTreeCover_binom) %>%
mutate(predClass = replace(predClass, predClass == 0, "no trees"),
predClass = replace(predClass, predClass == 1, "trees"),
obsClass = replace(obsClass, obsClass == 0, "no trees"),
obsClass = replace(obsClass, obsClass == 1, "trees")) %>%
mutate(predClass = as.factor(predClass) ,
obsClass = as.factor(obsClass))
# make matrix as a data.frame
# confMat <- confusionMatrix(data = matData$predClass,
# reference = matData$obsClass)
ConfusionTableR::binary_visualiseR(
train_labels = as.factor(matData_test$predClass),
truth_labels = as.factor(matData_test$obsClass),
class_label1 = "No trees",
class_label2 = "Trees",
quadrant_col1 = "wheat",
quadrant_col2 = "forestgreen",
text_col = "white", custom_title = "Confusion Matrix: classification of test dataset \nwith the best lambda model using the test dataset"
)
### full dataset
# make figures
map_missClass <- ggplot() +
geom_spatraster(data =pred_glm1_1se_full_RAST, aes(fill= missClass), maxcell = 20000000) +
#stat_summary_2d(data = pred_glm1, aes(x = x, y = y, z = missClass), fun = function(x) {round(mean(x))}, binwidth = .05) +
geom_sf(data=cropped_states_2,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
#geom_sf(data = badResids_high, col = "blue") +
#geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Misclassification and correct classifications \nfrom the second-best lambda model of Yes/No Trees \nfull dataset"),
subtitle = "second-best lambda model \nvalues show are the most common classification in each 8km x 8km grid cell") +
scale_fill_discrete(palette =
c("forestgreen", "red", "wheat", "blue"),
na.value = "white",
labels = c("forest", "Data = no trees; Model = trees", "non-forest", "Data = trees; Model = no trees", "")) +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
map_missClass_ag <- ggplot() +
geom_spatraster(data = pred_glm1_1se_full_RAST, aes(fill= PercAgreement), maxcell = 20000000) +
#stat_summary_2d(data = pred_glm1, aes(x = x, y = y, z = missClass), fun = function(x) {round(mean(x))}, binwidth = .05) +
geom_sf(data=cropped_states_2,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
#geom_sf(data = badResids_high, col = "blue") +
#geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("% Agreement of the classification values within each 8x8km^2 gridcell \nclassifications using the second-best lambda model of Yes/No Trees \nfull dataset"))+
scale_fill_viridis_c(option = "B", na.value = "white", name = "% agreement", ) +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
ggarrange(map_missClass, map_missClass_ag, ncol = 1, heights = c(1, .7), align = "v")
# make a confusion matrix
# prepare data
matData <- pred_glm1_1se_full %>%
mutate(predClass = TotalTreeCover_binom_pred_binary,
obsClass = TotalTreeCover_binom) %>%
mutate(predClass = replace(predClass, predClass == 0, "no trees"),
predClass = replace(predClass, predClass == 1, "trees"),
obsClass = replace(obsClass, obsClass == 0, "no trees"),
obsClass = replace(obsClass, obsClass == 1, "trees")) %>%
mutate(predClass = as.factor(predClass) ,
obsClass = as.factor(obsClass))
# make matrix as a data.frame
# confMat <- confusionMatrix(data = matData$predClass,
# reference = matData$obsClass)
ConfusionTableR::binary_visualiseR(
train_labels = as.factor(matData$predClass),
truth_labels = as.factor(matData$obsClass),
class_label1 = "No trees",
class_label2 = "Trees",
quadrant_col1 = "wheat",
quadrant_col2 = "forestgreen",
text_col = "white", custom_title = "Confusion Matrix: classification of full dataset \nwith the second-best lambda model using the full dataset"
)
### test dataset
# make figures
map_missClass_test <- ggplot() +
geom_spatraster(data =pred_glm1_1se_test_RAST, aes(fill= missClass), maxcell = 20000000) +
#stat_summary_2d(data = pred_glm1, aes(x = x, y = y, z = missClass), fun = function(x) {round(mean(x))}, binwidth = .05) +
geom_sf(data=cropped_states_2,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
#geom_sf(data = badResids_high, col = "blue") +
#geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("Misclassification and correct classifications \nfrom the second-best lambda model of Yes/No Trees \ntest dataset"),
subtitle = "second-best lambda model \nvalues show are the most common classification in each 8km x 8km grid cell") +
scale_fill_discrete(palette =
c("forestgreen", "red", "wheat", "blue"),
na.value = "white",
labels = c("forest", "Data = no trees; Model = trees", "non-forest", "Data = trees; Model = no trees", "")) +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
map_missClass_test_ag <- ggplot() +
geom_spatraster(data = pred_glm1_1se_test_RAST, aes(fill= PercAgreement), maxcell = 20000000) +
#stat_summary_2d(data = pred_glm1, aes(x = x, y = y, z = missClass), fun = function(x) {round(mean(x))}, binwidth = .05) +
geom_sf(data=cropped_states_2,fill=NA ) +
geom_sf(data = mapRegions, fill = NA, col = "orchid", lwd = .5) +
#geom_sf(data = badResids_high, col = "blue") +
#geom_sf(data = badResids_low, col = "red") +
labs(title = paste0("% Agreement of the classification values within each 8x8km^2 gridcell \nclassifications using the second-best lambda model of Yes/No Trees \ntest dataset"))+
scale_fill_viridis_c(option = "B", na.value = "white", name = "% agreement", ) +
coord_sf(datum = st_crs(pred_glm1_full_RAST)) +
xlim(-2000000, 2500000) +
ylim(-1800000, 900000)
ggarrange(map_missClass_test, map_missClass_test_ag, ncol = 1, heights = c(1, .7), align = "v")
# make a confusion matrix
# prepare data
matData_test <- pred_glm1_1se_test %>%
mutate(predClass = TotalTreeCover_binom_pred_binary,
obsClass = TotalTreeCover_binom) %>%
mutate(predClass = replace(predClass, predClass == 0, "no trees"),
predClass = replace(predClass, predClass == 1, "trees"),
obsClass = replace(obsClass, obsClass == 0, "no trees"),
obsClass = replace(obsClass, obsClass == 1, "trees")) %>%
mutate(predClass = as.factor(predClass) ,
obsClass = as.factor(obsClass))
# make matrix as a data.frame
# confMat <- confusionMatrix(data = matData$predClass,
# reference = matData$obsClass)
ConfusionTableR::binary_visualiseR(
train_labels = as.factor(matData_test$predClass),
truth_labels = as.factor(matData_test$obsClass),
class_label1 = "No trees",
class_label2 = "Trees",
quadrant_col1 = "wheat",
quadrant_col2 = "forestgreen",
text_col = "white", custom_title = "Confusion Matrix: classification of test dataset \nwith the second-best lambda model using the test dataset"
)
if ( name_secondBestMod == "Intercept_Only") {
print("The next best lambda model only contains one predictor (an intercept)")
## calculate Misclassification in the input pred_glm1 or pred_glm1se datasets
pred_glm1_full$missClass2 <- pred_glm1_full$TotalTreeCover_binom - pred_glm1_full$TotalTreeCover_binom_pred_binary
pred_glm1_test$missClass2 <- pred_glm1_test$TotalTreeCover_binom - pred_glm1_test$TotalTreeCover_binom_pred_binary
# plot misclassifications against Year
yearResidMod_bestLambda <- ggplot(pred_glm1_full) +
geom_point(aes(x = jitter(Year), y = jitter(missClass2)), alpha = .1) +
geom_hline(aes(yintercept = 0), lty = 2, col = "red") +
geom_smooth(aes(x = Year, y = missClass2), method = "lm") +
xlab("Year") +
ylab("misclassifications") +
ggtitle("from best lamba model")
# plot misclassifications against Lat
latResidMod_bestLambda <- ggplot(pred_glm1_full) +
geom_point(aes(x = y, y = jitter(missClass2)), alpha = .1) +
geom_hline(aes(yintercept = 0), lty = 2, col = "red") +
geom_smooth(aes(x = y, y = missClass2)) +
xlab("Latitude") +
ylab("misclassifications") +
ggtitle("from best lamba model")
# plot misclassifications against Long
longResidMod_bestLambda <- ggplot(pred_glm1_full) +
geom_point(aes(x = x, y = jitter(missClass2)), alpha = .1) +
geom_hline(aes(yintercept = 0), lty = 2, col = "red") +
geom_smooth(aes(x = x, y = missClass2)) +
xlab("Longitude") +
ylab("misclassifications") +
ggtitle("from best lamba model")
library(patchwork)
(yearResidMod_bestLambda ) /
( latResidMod_bestLambda ) /
( longResidMod_bestLambda )
} else {
## calculate missClass2ification in the input pred_glm1 or pred_glm1se datasets
pred_glm1_full$missClass2 <- pred_glm1_full$TotalTreeCover_binom - pred_glm1_full$TotalTreeCover_binom_pred_binary
pred_glm1_test$missClass2 <- pred_glm1_test$TotalTreeCover_binom - pred_glm1_test$TotalTreeCover_binom_pred_binary
pred_glm1_1se_full$missClass2 <- pred_glm1_1se_full$TotalTreeCover_binom - pred_glm1_1se_full$TotalTreeCover_binom_pred_binary
pred_glm1_1se_test$missClass2 <- pred_glm1_1se_test$TotalTreeCover_binom - pred_glm1_1se_test$TotalTreeCover_binom_pred_binary
# plot misclassifications against Year
yearResidMod_bestLambda <- ggplot(pred_glm1_full) +
geom_point(aes(x = jitter(Year), y = jitter(missClass2)), alpha = .1) +
geom_hline(aes(yintercept = 0), lty = 2, col = "red") +
geom_smooth(aes(x = Year, y = missClass2), method = "lm") +
xlab("Year") +
ylab("misclassifications") +
ggtitle("from best lamba model")
yearResidMod_1seLambda <- ggplot(pred_glm1_1se_full) +
geom_point(aes(x = jitter(Year), y = jitter(missClass2)), alpha = .1) +
geom_hline(aes(yintercept = 0), lty = 2, col = "red") +
geom_smooth(aes(x = Year, y = missClass2), method = "lm") +
xlab("Year") +
ylab("misclassifications") +
ggtitle(paste0("from ", name_secondBestMod))
# plot misclassifications against Lat
latResidMod_bestLambda <- ggplot(pred_glm1_full) +
geom_point(aes(x = y, y = jitter(missClass2)), alpha = .1) +
geom_hline(aes(yintercept = 0), lty = 2, col = "red") +
geom_smooth(aes(x = y, y = missClass2)) +
xlab("Latitude") +
ylab("misclassifications") +
ggtitle("from best lamba model")
latResidMod_1seLambda <- ggplot(pred_glm1_1se_full) +
geom_point(aes(x = y, y = jitter(missClass2)), alpha = .1) +
geom_hline(aes(yintercept = 0), lty = 2, col = "red") +
geom_smooth(aes(x = y, y = missClass2)) +
xlab("Latitude") +
ylab("misclassifications") +
ggtitle(paste0("from ", name_secondBestMod))
# plot misclassifications against Long
longResidMod_bestLambda <- ggplot(pred_glm1_full) +
geom_point(aes(x = x, y = jitter(missClass2)), alpha = .1) +
geom_hline(aes(yintercept = 0), lty = 2, col = "red") +
geom_smooth(aes(x = x, y = missClass2)) +
xlab("Longitude") +
ylab("misclassifications") +
ggtitle("from best lamba model")
longResidMod_1seLambda <- ggplot(pred_glm1_1se_full) +
geom_point(aes(x = x, y = jitter(missClass2)), alpha = .1) +
geom_hline(aes(yintercept = 0), lty = 2, col = "red") +
geom_smooth(aes(x = x, y = missClass2)) +
xlab("Longitude") +
ylab("misclassifications") +
ggtitle(paste0("from ", name_secondBestMod))
library(patchwork)
(yearResidMod_bestLambda + yearResidMod_1seLambda) /
( latResidMod_bestLambda + latResidMod_1seLambda) /
( longResidMod_bestLambda + longResidMod_1seLambda)
}
Binning predictor variables into “Quantiles”and looking at the mean predicted probability for each quantile
response_vars <- c("TotalTreeCover_binom", "TotalTreeCover_binom_pred")
# get deciles for best lambda model
if (length(prednames_fig) == 0) {
print("The best lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles <- predvars2deciles(pred_glm1_full,
response_vars = response_vars,
pred_vars = prednames_fig,
cut_points = seq(0, 1, 0.01))
}
# get deciles for 1 SE lambda model
if ( name_secondBestMod == "Intercept_Only") {
print("The next best lambda model only contains one predictor (an intercept)")} else {
pred_glm1_deciles_1se <- predvars2deciles(pred_glm1_1se_full,
response_vars = response_vars,
pred_vars = prednames_secondBestMod,
cut_points = seq(0, 1, 0.01))
}
Below are quantile plots for the best lambda model (note that the predictor variables are scaled)
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
# publication quality version
g3 <- decile_dotplot_pq(df = pred_glm1_deciles, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")), IQR = FALSE,
CI = FALSE
) + ggtitle("Decile Plot")
g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")), dfRaw = pred_glm1_full, add_smooth = TRUE, deciles = FALSE)
if(save_figs) {
# png(paste0("../../../Figures/CoverDatFigures/ figures/quantile_plots/quantile_plot_", response, "_",ecoregion,".png"),
# units = "in", res = 600, width = 5.5, height = 3.5 )
# print(g4)
# dev.off()
}
g4
}
Below are quantile plots from the second best lambda model ()
if ( name_secondBestMod == "Intercept_Only") {
print("The next best lambda model only contains one predictor (an intercept)")
} else {
# publication quality version
g3 <- decile_dotplot_pq(pred_glm1_deciles_1se, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")), IQR = FALSE) + ggtitle("Decile Plot")
g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles_1se, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")), dfRaw = pred_glm1_1se_full, add_smooth = TRUE, deciles = FALSE)
if(save_figs) {
# png(paste0("figures/quantile_plots/quantile_plot_", response, "_",ecoregion,".png"),
# units = "in", res = 600, width = 5.5, height = 3.5 )
# print(g4)
# dev.off()
}
g4
}
Filtered quantile plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower 20th percentiles of each predictor variable.
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt <- predvars2deciles( pred_glm1_full,
response_vars = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")),
pred_vars = prednames_fig,
filter_var = TRUE,
filter_vars = prednames_fig,
cut_points = seq(0, 1, 0.01))
g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt, response = "TotalTreeCover_binom",
xvars = prednames_fig)
if(save_figs) {
# jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
# units = "in", res = 600, width = 5.5, height = 6 )
# g
# dev.off()
}
g
}
Filtered quantile figure with middle 2 deciles also shown
if ( name_secondBestMod == "Intercept_Only") {
print("The next best lambda model only contains one predictor (an intercept)")
} else {
pred_glm1_deciles_filt_mid <- predvars2deciles(pred_glm1_full,
response_vars = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")),
pred_vars = prednames_fig,
filter_vars = prednames_fig,
filter_var = TRUE,
add_mid = TRUE,
cut_points = seq(0, 1, 0.01))
g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt_mid, response = "TotalTreeCover_binom",
xvars = prednames_fig)
if(save_figs) {
# jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
# units = "in", res = 600, width = 5.5, height = 6)
# g
# dev.off()
}
g
}
Filtered ‘Quantile’ plots of data. These plots show each vegetation variable, but only based on data that falls into the upper and lower 20th percentiles of each predictor variable.
if (length(prednames_secondBestMod) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt <- predvars2deciles( pred_glm1_1se_full,
response_vars = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")),
pred_vars = prednames_secondBestMod,
filter_var = TRUE,
filter_vars = prednames_secondBestMod,
cut_points = seq(0, 1, 0.01))
g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt, response = "TotalTreeCover_binom",
xvars = prednames_fig)
if(save_figs) {
# jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
# units = "in", res = 600, width = 5.5, height = 6 )
# g
# dev.off()
}
g
}
Filtered quantile figure with middle 2 deciles also shown
if (length(prednames_secondBestMod) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_filt_mid <- predvars2deciles(pred_glm1_1se_full,
response_vars = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred")),
pred_vars = prednames_secondBestMod,
filter_vars = prednames_secondBestMod,
filter_var = TRUE,
add_mid = TRUE,
cut_points = seq(0, 1, 0.01))
g <- decile_dotplot_filtered_pq(df = pred_glm1_deciles_filt_mid, response = "TotalTreeCover_binom",
xvars = prednames_fig)
if(save_figs) {
# jpeg(paste0("figures/quantile_plots/quantile_plot_filtered_mid_v1", , ".jpeg"),
# units = "in", res = 600, width = 5.5, height = 6 )
# g
# dev.off()
}
g
}
response_vars <- c("TotalTreeCover_binom", "TotalTreeCover_binom_pred_binary")
# get deciles for best lambda model
if (length(prednames_fig) == 0) {
print("The best lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles <- predvars2deciles(pred_glm1_full,
response_vars = response_vars,
pred_vars = prednames_fig,
cut_points = seq(0, 1, 0.01))
}
# get deciles for 1 SE lambda model
if ( name_secondBestMod == "Intercept_Only") {
print("The next best lambda model only contains one predictor (an intercept)")} else {
pred_glm1_deciles_1se <- predvars2deciles(pred_glm1_1se_full,
response_vars = response_vars,
pred_vars = prednames_secondBestMod,
cut_points = seq(0, 1, 0.01))
}
Below are quantile plots for the best lambda model (note that the predictor variables are scaled)
if (length(prednames_fig) == 0) {
print("The model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
# #### code to generate plots analagous to "decile plots" by hand, do double check that they're suitable for use w/ binary predictions
# pred_glm1 %>%
# #slice_sample(n = 10000) %>%
# pivot_longer(cols = all_of(prednames_fig)) %>%
# select(Year, x, y, TotalTreeCover_binom, TotalTreeCover_binom_pred_binary, name, value) %>%
# ggplot() +
# facet_wrap(~name, scales = "free") +
# geom_point(aes(x = value, y = jitter(TotalTreeCover_binom)), alpha = .5) +
# geom_point(aes(x = value, y = jitter(TotalTreeCover_binom_pred_binary)), col = "blue", alpha = .5) +
# geom_smooth(aes(x = value, y = TotalTreeCover_binom), col = "black") +
# geom_smooth(aes(x = value, y = TotalTreeCover_binom_pred_binary), col = "blue")
#
#####
# publication quality version
g3 <- decile_dotplot_pq(df = pred_glm1_deciles, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred_binary")), IQR = FALSE,
CI = FALSE
) + ggtitle("Decile Plot")
g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred_binary")), dfRaw = pred_glm1_full, add_smooth = TRUE, deciles = FALSE)
if(save_figs) {
# png(paste0("../../../Figures/CoverDatFigures/ figures/quantile_plots/quantile_plot_", response, "_",ecoregion,".png"),
# units = "in", res = 600, width = 5.5, height = 3.5 )
# print(g4)
# dev.off()
}
g4
}
Below are quantile plots from the second best lambda model (not that the predictor variables are scaled)
if ( name_secondBestMod == "Intercept_Only") {
print("The next best lambda model only contains one predictor (an intercept)")
} else {
# publication quality version
g3 <- decile_dotplot_pq(pred_glm1_deciles_1se, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred_binary")), IQR = FALSE) + ggtitle("Decile Plot")
g4 <- add_dotplot_inset(g3, df = pred_glm1_deciles_1se, response = c("TotalTreeCover_binom", paste0("TotalTreeCover_binom", "_pred_binary")), dfRaw = pred_glm1_1se_full, add_smooth = TRUE, deciles = FALSE)
if(save_figs) {
# png(paste0("figures/quantile_plots/quantile_plot_", response, "_",ecoregion,".png"),
# units = "in", res = 600, width = 5.5, height = 3.5 )
# print(g4)
# dev.off()
}
g4
}
# get deciles for best lambda model
if (length(prednames_fig) == 0) {
print("The best lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles %>%
ggplot(aes(x = mean_value, y = RMSE)) +
facet_wrap(~name, scales = "free_x") +
geom_point(alpha = .2, size = .5) +
geom_smooth(lwd = .5) +
xlab("Scaled predictor value") +
ggtitle("RMSE by decile for bestLambda model")
}
# get deciles for 1 SE lambda model
if (length(prednames_secondBestMod) == 0) {
print("The 1SE (or 1/2 SE) lambda model only contains one predictor (an intercept), so decile plots aren't possible to generate")
} else {
pred_glm1_deciles_1se %>%
ggplot(aes(x = mean_value, y = RMSE)) +
facet_wrap(~name, scales = "free_x") +
geom_point(alpha = .2, size = .5) +
geom_smooth(lwd = .5) +
xlab("Scaled predictor value") +
ggtitle(paste0("RMSE by decile for ", name_secondBestMod, "model"))
}
# get the parameters that were used to scale the data
scaleParams <- modDat_1_s_temp %>%
filter(Year == 2016) %>%
dplyr::select(tmin_s:AWHC_s) %>%
reframe(across(all_of(names(.)), attributes))
# function for printing model statement
## function to get model statements
getModelStatement <- function(coefficientTable, # name of the d.f that has model coefficients
modelName, # name of the column in the coefficient table that has the parameters of interest
responseVar # name of the response variable
) {
##
# coefficientTable <- grassShrub_totalHerb_trimAnoms
# modelName <- "coefficientValue_bestLambda"
# responseVar <- "TotalHerbaceousCover"
##
temp <- coefficientTable[,c("coefficientName", modelName)] %>%
drop_na()
rownames(temp) <- temp$coefficientName
temp[,modelName] <- round(temp[,modelName], 8)
# print out coefficients w/ coefficient names
tempNames <- paste0(
apply(temp, MARGIN = 1, FUN = function(x) {
if (x["coefficientName"] == "(Intercept)") {
paste0(x[modelName])
} else {
paste0(x[modelName], "*", x["coefficientName"])
}
}
),
collapse = " + "
)
# print the unscaled model statement
unscaledModelName <- paste0(responseVar, "~ exp(", tempNames, ") - 2")
# now add in the scale parameters
tempNames <- str_replace_all(tempNames, pattern = "annWetDegDays_anom",
replacement = paste0("((annWetDegDays_anom - ",
round(scaleParams$annWetDegDays_anom_s$`scaled:center`,8), ") / ",
round(scaleParams$annWetDegDays_anom_s$`scaled:scale`,8), ")"))
tempNames <- str_replace_all(tempNames, pattern = "prcpTempCorr_anom",
replacement = paste0("((prcpTempCorr_anom - ",
round(scaleParams$prcpTempCorr_anom_s$`scaled:center`,8), ") / ",
round(scaleParams$prcpTempCorr_anom_s$`scaled:scale`,8), ")"))
tempNames <- str_replace_all(tempNames, pattern = "prcp_seasonality_anom",
replacement = paste0("((prcp_seasonality_anom - ",
round(scaleParams$prcp_seasonality_anom_s$`scaled:center`,8), ") / ",
round(scaleParams$prcp_seasonality_anom_s$`scaled:scale`,8), ") "))
tempNames <- str_replace_all(tempNames, pattern = "annWatDef_anom",
replacement = paste0("((annWatDef_anom - ",
round(scaleParams$annWatDef_anom_s$`scaled:center`,8), ") / ",
round(scaleParams$annWatDef_anom_s$`scaled:scale`,8), ") "))
tempNames <- str_replace_all(tempNames, pattern = "isothermality_anom",
replacement = paste0("((isothermality_anom - ",
round(scaleParams$isothermality_anom_s$`scaled:center`,8), ") / ",
round(scaleParams$isothermality_anom_s$`scaled:scale`,8), ") "))
tempNames <- str_replace_all(tempNames, pattern = "prcp_anom",
replacement = paste0("((prcp_anom - ",
round(scaleParams$prcp_anom_s$`scaled:center`,8), ") / ",
round(scaleParams$prcp_anom_s$`scaled:scale`,8), ") "))
tempNames <- str_replace_all(tempNames, pattern = "prcp ",
replacement = paste0("((prcp - ",
round(scaleParams$prcp_s$`scaled:center`,8), ") / ",
round(scaleParams$prcp_s$`scaled:scale`,8), ") "))
tempNames <- str_replace_all(tempNames, pattern = "prcp\\^",
replacement = paste0("((prcp - ",
round(scaleParams$prcp_s$`scaled:center`,8), ") / ",
round(scaleParams$prcp_s$`scaled:scale`,8), ")^"))
tempNames <- str_replace_all(tempNames, pattern = "prcp:",
replacement = paste0("((prcp - ",
round(scaleParams$prcp_s$`scaled:center`,8), ") / ",
round(scaleParams$prcp_s$`scaled:scale`,8), "):"))
tempNames <- str_replace_all(tempNames, pattern = "prcpTempCorr ",
replacement = paste0("((prcpTempCorr - ",
round(scaleParams$prcpTempCorr_s$`scaled:center`,8), ") / ",
round(scaleParams$prcpTempCorr_s$`scaled:scale`,8), ") "))
tempNames <- str_replace_all(tempNames, pattern = "prcpTempCorr\\^",
replacement = paste0("((prcpTempCorr - ",
round(scaleParams$prcpTempCorr_s$`scaled:center`,8), ") / ",
round(scaleParams$prcpTempCorr_s$`scaled:scale`,8), ")^"))
tempNames <- str_replace_all(tempNames, pattern = "prcpTempCorr:",
replacement = paste0("((prcpTempCorr - ",
round(scaleParams$prcpTempCorr_s$`scaled:center`,8), ") / ",
round(scaleParams$prcpTempCorr_s$`scaled:scale`,8), "):"))
tempNames <- str_replace_all(tempNames, pattern = "prcpTempCorr\\)",
replacement = paste0("((prcpTempCorr - ",
round(scaleParams$prcpTempCorr_s$`scaled:center`,8), ") / ",
round(scaleParams$prcpTempCorr_s$`scaled:scale`,8), "))"))
tempNames <- str_replace_all(tempNames, pattern = "prcpTempCorr$",
replacement = paste0("((prcpTempCorr - ",
round(scaleParams$prcpTempCorr_s$`scaled:center`,8), ") / ",
round(scaleParams$prcpTempCorr_s$`scaled:scale`,8), ")"))
tempNames <- str_replace_all(tempNames, pattern = "isothermality ",
replacement = paste0("((isothermality - ",
round(scaleParams$isothermality_s$`scaled:center`,8), ") / ",
round(scaleParams$isothermality_s$`scaled:scale`,8), ") "))
tempNames <- str_replace_all(tempNames, pattern = "isothermality\\^",
replacement = paste0("((isothermality - ",
round(scaleParams$isothermality_s$`scaled:center`,8), ") / ",
round(scaleParams$isothermality_s$`scaled:scale`,8), ")^"))
tempNames <- str_replace_all(tempNames, pattern = "isothermality:",
replacement = paste0("((isothermality - ",
round(scaleParams$isothermality_s$`scaled:center`,8), ") / ",
round(scaleParams$isothermality_s$`scaled:scale`,8), "):"))
tempNames <- str_replace_all(tempNames, pattern = "sand",
replacement = paste0("((sand - ",
round(scaleParams$sand_s$`scaled:center`,8), ") / ",
round(scaleParams$sand_s$`scaled:scale`,8), ")"))
tempNames <- str_replace_all(tempNames, pattern = "coarse",
replacement = paste0("((coarse - ",
round(scaleParams$coarse_s$`scaled:center`,8), ") / ",
round(scaleParams$coarse_s$`scaled:scale`,8), ")"))
tempNames <- str_replace_all(tempNames, pattern = "AWHC",
replacement = paste0("((AWHC - ",
round(scaleParams$AWHC_s$`scaled:center`,8), ") / ",
round(scaleParams$AWHC_s$`scaled:scale`,8), ")"))
tempNames <- str_replace_all(tempNames, pattern = "carbon",
replacement = paste0("((carbon - ",
round(scaleParams$carbon_s$`scaled:center`,8), ") / ",
round(scaleParams$carbon_s$`scaled:scale`,8), ")"))
tempNames <- str_replace_all(tempNames, pattern = "clay",
replacement = paste0("((clay - ",
round(scaleParams$clay_s$`scaled:center`,8), ") / ",
round(scaleParams$clay_s$`scaled:scale`,8), ")"))
tempNames <- str_replace_all(tempNames, pattern = "annWatDef ",
replacement = paste0("((annWatDef - ",
round(scaleParams$annWatDef_s$`scaled:center`,8), ") / ",
round(scaleParams$annWatDef_s$`scaled:scale`,8), ") "))
tempNames <- str_replace_all(tempNames, pattern = "annWatDef:",
replacement = paste0("((annWatDef - ",
round(scaleParams$annWatDef_s$`scaled:center`,8), ") / ",
round(scaleParams$annWatDef_s$`scaled:scale`,8), "):"))
tempNames <- str_replace_all(tempNames, pattern = "prcp_seasonality ",
replacement = paste0("((prcp_seasonality - ",
round(scaleParams$prcp_seasonality_s$`scaled:center`,8), ") / ",
round(scaleParams$prcp_seasonality_s$`scaled:scale`,8), ") "))
tempNames <- str_replace_all(tempNames, pattern = "prcp_seasonality:",
replacement = paste0("((prcp_seasonality - ",
round(scaleParams$prcp_seasonality_s$`scaled:center`,8), ") / ",
round(scaleParams$prcp_seasonality_s$`scaled:scale`,8), "):"))
tempNames <- str_replace_all(tempNames, pattern = "prcp_seasonality\\^",
replacement = paste0("((prcp_seasonality - ",
round(scaleParams$prcp_seasonality_s$`scaled:center`,8), ") / ",
round(scaleParams$prcp_seasonality_s$`scaled:scale`,8), ")^"))
tempNames <- str_replace_all(tempNames, pattern = "tmean ",
replacement = paste0("((tmean - ",
round(scaleParams$tmean_s$`scaled:center`,8), ") / ",
round(scaleParams$tmean_s$`scaled:scale`,8), ") "))
tempNames <- str_replace_all(tempNames, pattern = "tmean:",
replacement = paste0("((tmean - ",
round(scaleParams$tmean_s$`scaled:center`,8), ") / ",
round(scaleParams$tmean_s$`scaled:scale`,8), "):"))
tempNames <- str_replace_all(tempNames, pattern = "tmean$",
replacement = paste0("((tmean - ",
round(scaleParams$tmean_s$`scaled:center`,8), ") / ",
round(scaleParams$tmean_s$`scaled:scale`,8), ")"))
tempNames <- str_replace_all(tempNames, pattern = "tmean\\^",
replacement = paste0("((tmean - ",
round(scaleParams$tmean_s$`scaled:center`,8), ") / ",
round(scaleParams$tmean_s$`scaled:scale`,8), ")^"))
tempNames <- str_replace_all(tempNames, pattern = "annWetDegDays ",
replacement = paste0("((annWetDegDays - ",
round(scaleParams$annWetDegDays_s$`scaled:center`,8), ") / ",
round(scaleParams$annWetDegDays_s$`scaled:scale`,8), ") "))
tempNames <- str_replace_all(tempNames, pattern = "annWetDegDays:",
replacement = paste0("((annWetDegDays - ",
round(scaleParams$annWetDegDays_s$`scaled:center`,8), ") / ",
round(scaleParams$annWetDegDays_s$`scaled:scale`,8), "):"))
tempNames <- str_replace_all(tempNames, pattern = "prcp_dry ",
replacement = paste0("((prcp_dry - ",
round(scaleParams$prcp_dry_s$`scaled:center`,8), ") / ",
round(scaleParams$prcp_dry_s$`scaled:scale`,8), ") "))
tempNames <- str_replace_all(tempNames, pattern = "prcp_dry:",
replacement = paste0("((prcp_dry - ",
round(scaleParams$prcp_dry_s$`scaled:center`,8), ") / ",
round(scaleParams$prcp_dry_s$`scaled:scale`,8), "):"))
tempNames <- str_replace_all(tempNames, pattern = "annWatDef_95",
replacement = paste0("((annWatDef_95 - ",
round(scaleParams$annWatDef_95_s$`scaled:center`,8), ") / ",
round(scaleParams$annWatDef_95_s$`scaled:scale`,8), ")"))
## print scaled model statement
scaledModelName <- paste0(responseVar, "~ exp(", tempNames, ")")
return(list("scaledInputVars_ModelStatement" = unscaledModelName,
"unscaledInputVars_scaledModelStatement" = scaledModelName))
}
## bestLambda model equations
bestLambdaModStatement <- getModelStatement(coefficientTable = coefs,
modelName = "coefficientValue_bestLambda",
responseVar = "P(Tree)")
## second-best lambda model equations
secondBestLambdaModStatement <- getModelStatement(coefficientTable = coefs,
modelName = ifelse(name_secondBestMod == "1/2 SE Model",
yes = "coefficientValue_halfseLambda",
no = "coefficientValue_1seLambda"),
responseVar = "P(Tree)")
print(bestLambdaModStatement$scaledInputVars_ModelStatement)
## [1] "P(Tree)~ exp(-0.67478165 + 1.71345065*prcp + -0.01633932*prcp_seasonality + -0.91285243*prcpTempCorr + 0.53377439*annWetDegDays + -0.60523936*annWatDef_95 + 0.20578879*coarse + -0.61568228*AWHC + -0.31818631*I(tmean^2) + 0.20104126*I(prcp_seasonality^2) + -0.89055646*I(prcpTempCorr^2) + 0.14226917*I(isothermality^2) + -0.11810211*I(clay^2) + 0.11278028*I(AWHC^2) + 0.91656456*annWetDegDays:annWatDef_95 + -0.29650844*prcp_dry:isothermality + -0.02061048*prcpTempCorr:isothermality + -0.28504255*isothermality:tmean + 0.50549093*prcpTempCorr:prcp_dry + -0.25285633*prcpTempCorr:tmean + -0.16290284*AWHC:clay + 0.09163249*coarse:clay) - 2"
print(bestLambdaModStatement$unscaledInputVars_scaledModelStatement)
## [1] "P(Tree)~ exp(-0.67478165 + 1.71345065*((prcp - 483.34337211) / 321.52170566) + -0.01633932*((prcp_seasonality - 0.97338094) / 0.23365057) + -0.91285243*((prcpTempCorr - 0.01681097) / 0.39882063) + 0.53377439*((annWetDegDays - 1808.61043539) / 1083.25298131) + -0.60523936*((annWatDef_95 - 164.68611646) / 104.14892564) + 0.20578879*((coarse - 9.66120017) / 10.16883531) + -0.61568228*((AWHC - 14.61190409) / 5.65765542) + -0.31818631*I(((tmean - 11.0291275) / 4.89048541)^2) + 0.20104126*I(((prcp_seasonality - 0.97338094) / 0.23365057)^2) + -0.89055646*I(((prcpTempCorr - 0.01681097) / 0.39882063)^2) + 0.14226917*I(((isothermality - 37.76731082) / 5.2256935)^2) + -0.11810211*I(((clay - 19.59171509) / 8.51944876)^2) + 0.11278028*I(((AWHC - 14.61190409) / 5.65765542)^2) + 0.91656456*((annWetDegDays - 1808.61043539) / 1083.25298131):((annWatDef_95 - 164.68611646) / 104.14892564) + -0.29650844*((prcp_dry - 3.18622575) / 5.36400552):((isothermality - 37.76731082) / 5.2256935) + -0.02061048*((prcpTempCorr - 0.01681097) / 0.39882063):((isothermality - 37.76731082) / 5.2256935) + -0.28504255*((isothermality - 37.76731082) / 5.2256935):((tmean - 11.0291275) / 4.89048541) + 0.50549093*((prcpTempCorr - 0.01681097) / 0.39882063):((prcp_dry - 3.18622575) / 5.36400552) + -0.25285633*((prcpTempCorr - 0.01681097) / 0.39882063):((tmean - 11.0291275) / 4.89048541) + -0.16290284*((AWHC - 14.61190409) / 5.65765542):((clay - 19.59171509) / 8.51944876) + 0.09163249*((coarse - 9.66120017) / 10.16883531):((clay - 19.59171509) / 8.51944876))"
print(secondBestLambdaModStatement$scaledInputVars_ModelStatement)
## [1] "P(Tree)~ exp(-0.8616112 + 1.9694935*prcp + -0.3086721*prcp_seasonality + -0.5967287*prcpTempCorr + -0.7655963*AWHC + -0.6416890*I(prcpTempCorr^2) + -0.1661580*I(clay^2) + -0.4376182*prcp_dry:isothermality + -0.2506869*AWHC:clay) - 2"
print(secondBestLambdaModStatement$unscaledInputVars_scaledModelStatement)
## [1] "P(Tree)~ exp(-0.8616112 + 1.9694935*((prcp - 483.34337211) / 321.52170566) + -0.3086721*((prcp_seasonality - 0.97338094) / 0.23365057) + -0.5967287*((prcpTempCorr - 0.01681097) / 0.39882063) + -0.7655963*((AWHC - 14.61190409) / 5.65765542) + -0.6416890*I(((prcpTempCorr - 0.01681097) / 0.39882063)^2) + -0.1661580*I(((clay - 19.59171509) / 8.51944876)^2) + -0.4376182*((prcp_dry - 3.18622575) / 5.36400552):((isothermality - 37.76731082) / 5.2256935) + -0.2506869*((AWHC - 14.61190409) / 5.65765542):((clay - 19.59171509) / 8.51944876))"
We use the ‘modDat_1_s_temp’ data.frame, which has all of the cover data, even for locations without observed tree cover, but has the climate and soils data that have been scaled in that same was as the data we used to fit the yes/no tree model
# prep data
modDat_1_s_temp$rowID <- 1:nrow(modDat_1_s_temp)
predData <- modDat_1_s_temp %>%
select(rowID, Year, x, y, paste0(prednames, "_s")) %>%
rename_with(~str_remove(.x, "_s$"),
any_of(paste0(names(prednames), "_s")))
# for bestLambda model
predData$treeProb_bestLambda <- predict.glm(object = fit_glm_bestLambda, newdata = predData, type = "response")
# for scond-bestLambda model
predData$treeProb_secondBestLambda <- predict.glm(object = mod_secondBest, newdata = predData, type = "response")
# make continuous predictions binary according to the identified best threshold for each model
predData <- predData %>%
mutate(treeProb_bestLambda_binary = treeProb_bestLambda,
treeProb_bestLambda_binary = replace(treeProb_bestLambda_binary, treeProb_bestLambda_binary > best_threshold_bestLambda, 1),
treeProb_bestLambda_binary = replace(treeProb_bestLambda_binary, treeProb_bestLambda_binary <= best_threshold_bestLambda, 0)) %>%
mutate(treeProb_secondBestLambda_binary = treeProb_secondBestLambda,
treeProb_secondBestLambda_binary = replace(treeProb_secondBestLambda_binary, treeProb_secondBestLambda_binary > best_threshold_secondBest, 1),
treeProb_secondBestLambda_binary = replace(treeProb_secondBestLambda_binary, treeProb_secondBestLambda_binary <= best_threshold_secondBest, 0))
# add predictions back to the original data frame and save
modDat_treePreds <- modDat_1_s_temp %>%
left_join(predData %>% select(-c(tmean:AWHC)), by = c("rowID", "Year", "x", "y")) %>%
select(-c(tmin_s:AWHC_s))
saveRDS(modDat_treePreds,"../../../Data_processed/CoverData/CoverModelData_withTreeNoTreePreds.rds")